from binary to weighed mutation profile, based on observed/expected s…

…core of gnomAD
0m1n0 · Dec 5, 2019 · bccbf7b · bccbf7b
1 parent 705b98a
commit bccbf7b
Show file tree

Hide file tree

Showing 4 changed files with 171 additions and 74 deletions.
diff --git a/stratipy/biostat_go.py b/stratipy/biostat_go.py
@@ -173,7 +173,7 @@ def merge_2_subgroups(data_folder, ssc_mutation_data, ssc_subgroups, gene_data,
 
     df_enrich1, df_pcount1 = p_df_by_subgroup(
         data_folder, ssc_mutation_data, ssc_subgroups, gene_data, ppi_data,
-        n_components, mut_type, alpha, ngh_max, n_permutations, lambd, O,
+        n_components, mut_type, alpha, ngh_max, n_permutations, lambd, O,
         min_category_size, max_category_size, max_category_depth)
 
     ################ SSC 2 ################

diff --git a/stratipy/biostat_plot.py b/stratipy/biostat_plot.py
@@ -178,7 +178,7 @@ def biostat_individuals_plot(df, data_folder, ssc_mutation_data, gene_data,
 
     # k=20 -> figsize=(5, 9)
     fig, ax = plt.subplots(nrows=df_fill.shape[0], ncols=df_fill.shape[1],
-                           sharex=True, sharey=True, figsize=(5, 9))
+                           sharex=True, sharey=True, figsize=(5, 20))
     if lambd > 0:
         nmf = 'GNMF'
     else:

diff --git a/stratipy/load_data.py b/stratipy/load_data.py
@@ -8,6 +8,7 @@
 from stratipy.nbs_class import Ppi
 from tqdm import trange
 import glob
+from statistics import median
 
 # NOTE some variable names changed:
 # dataFolder -> data_folder
@@ -80,9 +81,10 @@ def get_indiv_list(indiv_from_df):
     return alist
 
 
-def mutation_profile_coordinate(df, indiv_list, indiv_type):
+def mutation_profile_coordinate_score(df, indiv_list, indiv_type, score_list):
     coord_gene = []
     coord_indiv = []
+    weight = []
 
     for i in trange(df.shape[0], desc="mutation profile coordinates"):
         # for each row (each gene), we get list of individuals' ID
@@ -93,8 +95,51 @@ def mutation_profile_coordinate(df, indiv_list, indiv_type):
             coord_gene.append(i)
             # individual is saved as his/her INDEX (not ID) in indiv_list
             coord_indiv.append(j)
+            # pLI score for each point
+            weight.append(score_list[i])
 
-    return coord_indiv, coord_gene
+    return coord_indiv, coord_gene, weight
+
+
+def generate_mp(df, pLI_type):
+    gene_id = [int(i) for i in df['entrez_id'].tolist()]
+    indiv_type = 'Admixture_individual'  # Admixture European probands
+    df[indiv_type] = df[indiv_type].apply(eval)
+    # create individual ID list
+    indiv = get_indiv_list(df[indiv_type])
+    if '[' in indiv: indiv.remove('[')
+    if ']' in indiv: indiv.remove(']')
+    # pLI_oe score list
+    pLI_score = df[pLI_type].tolist()
+
+    # calculate coordinates genes x individuals -> sparse matrix
+    coord_indiv, coord_gene, weight = mutation_profile_coordinate_score(
+        df, indiv, indiv_type, pLI_score)
+
+    mutation_profile = sp.coo_matrix((weight, (coord_indiv, coord_gene)),
+                                     shape=(len(indiv), len(gene_id)),
+                                     dtype=np.float32).tocsr()
+    return mutation_profile, gene_id, indiv
+
+
+def merge_lof_mis(df_lof, df_mis, pLI_lof_name, pLI_mis_name):
+    mp_lof, gene_lof, indiv_lof = generate_mp(df_lof, pLI_lof_name)
+    mp_mis, gene_mis, indiv_mis = generate_mp(df_mis, pLI_mis_name)
+
+    df_lof = pd.DataFrame(mp_lof.toarray(), columns=gene_lof, index=indiv_lof)
+    df_mis = pd.DataFrame(mp_mis.toarray(), columns=gene_mis, index=indiv_mis)
+
+    # merge 2 dataframes and take high value if needed
+    df = pd.concat([df_lof, df_mis]).groupby(level=0).max()
+    # fill NA by 0
+    df = df.fillna(0)
+
+    # dataframe to scipy sparse matrix
+    mutation_profile = sp.csr_matrix(df.values)
+    indiv = df.index.values.tolist()
+    gene_id = df.columns.tolist()
+
+    return mutation_profile, gene_id, indiv
 
 
 def load_overall_SSC_mutation_profile(data_folder, ssc_mutation_data):
@@ -115,29 +160,18 @@ def load_overall_SSC_mutation_profile(data_folder, ssc_mutation_data):
         print('overall_{}_mutation_profile file is calculating.....'
               .format(ssc_mutation_data))
 
-        df = pd.read_csv(data_folder + 'SSC_{}_gene_filtered.csv'
-                         .format(ssc_mutation_data), sep=';')
-        # create gene ID (EntrezGene ID) list
-        gene_id = [int(i) for i in df['entrez_id'].tolist()]
-
-        # individuals' ID list for each row is transformed in string of list
-        # we thus reformate to list
-        # indiv_type = 'individual' # all individuals
-        indiv_type = 'Admixture_individual' # Admixture European probands
-        df[indiv_type] = df[indiv_type].apply(eval)
-        # create individual ID list
-        indiv = get_indiv_list(df[indiv_type])
-        if '[' in indiv: indiv.remove('[')
-        if ']' in indiv: indiv.remove(']')
-
-        # calculate coordinates genes x individuals -> sparse matrix
-        coord_indiv, coord_gene = mutation_profile_coordinate(df, indiv, indiv_type)
-        # mutation weight = 1
-        weight = np.ones(len(coord_gene))
-        # coo matrix then to csr matrix
-        mutation_profile = sp.coo_matrix((weight, (coord_indiv, coord_gene)),
-                                         shape=(len(indiv), len(gene_id)),
-                                         dtype=np.float32).tocsr()
+        df_lof = pd.read_csv(
+            data_folder + 'SSC_MAF1_LoF_gene_filtered_pLI_oe.csv', sep=';')
+
+        if 'mis15' in ssc_mutation_data:
+            df_mis = pd.read_csv(
+                data_folder + 'SSC_MAF1_mis15_gene_filtered_pLI_oe.csv', sep=';')
+        elif 'mis30' in ssc_mutation_data:
+            df_mis = pd.read_csv(
+                data_folder + 'SSC_MAF1_mis30_gene_filtered_pLI_oe.csv', sep=';')
+
+        mutation_profile, gene_id, indiv = merge_lof_mis(
+            df_lof, df_mis, 'oe_lof_rescaled', 'oe_mis_rescaled')
 
         savemat(overall_mutation_profile_file,
                 {'mutation_profile': mutation_profile,
@@ -168,15 +202,30 @@ def load_specific_SSC_mutation_profile(data_folder, ssc_mutation_data, ssc_subgr
             load_overall_SSC_mutation_profile(data_folder, ssc_mutation_data))
         print("SSC overall mutation profile matrix\n    shape: {}\n    stored elements: {}".format(mutation_profile.shape, mutation_profile.nnz))
 
-        # if SSC 1 or 2
         if ssc_subgroups != "SSC":
             print('{}_{}_mutation_profile file is calculating.....'.format(
                 ssc_mutation_data, ssc_subgroups))
+            if (ssc_subgroups == 'SSC1') or (ssc_subgroups == 'SSC2'):
+                print('{}_{}_mutation_profile file is calculating.....'.format(
+                    ssc_mutation_data, ssc_subgroups))
+
+                df_ssc = pd.read_csv(data_folder + '{}_phenotype.csv'
+                                     .format(ssc_subgroups), sep='\t')
+                ind_ssc_raw = df_ssc.individual.tolist()
+    #             ind_ssc_raw = df_ssc.individual.apply(eval).tolist()
+
+            # 'SSC_all', 'SSC_male', 'SSC_female'
+            else:
+                if ssc_subgroups == 'SSC_all':
+                    indiv_file = data_folder + 'admixture_1175_all.txt'
+                elif ssc_subgroups == 'SSC_male':
+                    indiv_file = data_folder + 'admixture_1175_male.txt'
+                elif ssc_subgroups == 'SSC_female':
+                    indiv_file = data_folder + 'admixture_1175_female.txt'
+
+                with open(indiv_file) as f:
+                    ind_ssc_raw = [line.rstrip('\n') for line in f]
 
-            df_ssc = pd.read_csv(data_folder + '{}_phenotype.csv'
-                                 .format(ssc_subgroups), sep='\t')
-            ind_ssc_raw = df_ssc.individual.tolist()
-#             ind_ssc_raw = df_ssc.individual.apply(eval).tolist()
             # looking for corresponding individual ID in overall data (indiv)
             # then append their index in a list (ind_ssc)
             ind_ssc = []
@@ -189,7 +238,7 @@ def load_specific_SSC_mutation_profile(data_folder, ssc_mutation_data, ssc_subgr
             # slice overall mutation profile by SSC subgroup individuals
             mutation_profile = mutation_profile[ind_ssc, :]
 
-        if gene_data != 'all':
+        if gene_data != 'allGenes':
             print('genes filtering according to:', gene_data)
             df_gene = pd.read_csv(data_folder + 'EntrezGene_{}.csv'
                                   .format(gene_data), sep='\t')

diff --git a/stratipy/nbs.py b/stratipy/nbs.py
@@ -15,15 +15,20 @@
 from scipy.io import loadmat, savemat
 
 
-def initiation(mut_type, alpha, patient_data, data_folder, ssc_mutation_data,
+def initiation(mut_type, patient_data, data_folder, ssc_mutation_data,
                ssc_subgroups, gene_data, ppi_data, lambd, n_components):
-    if mut_type == 'raw':
-        alpha = 0
-        result_folder = (
-            data_folder + 'result_' + ssc_mutation_data + '_' +
-            ssc_subgroups + '_' + gene_data +  '/' + mut_type + '/')
-
     if patient_data == 'SSC':
+        if mut_type == 'raw':
+            alpha = 0
+            ppi_data = 'noPPI'
+
+        else:
+            if ppi_data == 'APID':
+                alpha = 0.6
+
+            elif ppi_data == 'STRING':
+                alpha = 0.581
+
         result_folder = (
             data_folder + 'result_' + ssc_mutation_data + '_' +
             ssc_subgroups + '_' + gene_data + '_' + ppi_data + '/')
@@ -33,31 +38,33 @@ def initiation(mut_type, alpha, patient_data, data_folder, ssc_mutation_data,
     else:
         result_folder = (data_folder + 'result_' + patient_data + '_' +
                          ppi_data + '/')
+        if mut_type == 'raw':
+            alpha = 0
 
     print(result_folder, flush=True)
+    print("\nIndividuals =", ssc_subgroups, flush=True)
     print("mutation type =", mut_type, flush=True)
     print("alpha =", alpha, flush=True)
     print("lambda =", lambd, flush=True)
     print("k =", n_components, flush=True)
 
-    return alpha, result_folder
+    return alpha, result_folder, ppi_data
 
 
-def preprocessing(data_folder, patient_data, ssc_mutation_data, ssc_subgroups, gene_data,
-                  ppi_data, result_folder, influence_weight, simplification,
-                  compute, overwrite, alpha, tol, ngh_max,
-                  keep_singletons, min_mutation, max_mutation, mut_type):
+def preprocess_noRaw(ppi_data, mut_type, ssc_subgroups, data_folder, patient_data, ssc_mutation_data,
+               gene_data, influence_weight, simplification, compute, overwrite, tol, ngh_max,
+               keep_singletons, min_mutation, max_mutation, result_folder, alpha):
     print("------------ load_data.py ------------ {}"
           .format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
           flush=True)
     if patient_data == 'TCGA_UCEC':
-        (patient_id, mutation_profile, gene_id_patient, gene_symbol_profile) = (
+        (individual_id, mutation_profile, gene_id, gene_symbol_profile) = (
             load_data.load_TCGA_UCEC_patient_data(data_folder))
     elif patient_data == 'Faroe':
-        mutation_profile, gene_id_patient = (
+        mutation_profile, gene_id = (
             load_data.load_Faroe_Islands_data(data_folder))
     elif patient_data == 'SSC':
-        mutation_profile, gene_id_patient, patient_id = (
+        mutation_profile, gene_id, individual_id = (
             load_data.load_specific_SSC_mutation_profile(
                 data_folder, ssc_mutation_data, ssc_subgroups, gene_data))
 
@@ -73,7 +80,12 @@ def preprocessing(data_folder, patient_data, ssc_mutation_data, ssc_subgroups, g
           flush=True)
     idx_ppi, idx_ppi_only, ppi_total, mut_total, ppi_filt = (
         formatting_data.formatting(
-            network, mutation_profile, gene_id_ppi, gene_id_patient))
+            network, mutation_profile, gene_id_ppi, gene_id))
+
+    # EntrezGene ID to int
+    entrez_ppi = [int(i) for i in gene_id_ppi]
+    # EntrezGene indexes in PPI after formatting
+    idx_filtred = idx_ppi + idx_ppi_only
 
     print("------------ filtering_diffusion.py ------------ {}"
           .format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
@@ -84,42 +96,78 @@ def preprocessing(data_folder, patient_data, ssc_mutation_data, ssc_subgroups, g
             overwrite, alpha, tol, ppi_total, mut_total, ngh_max,
             keep_singletons, min_mutation, max_mutation, mut_type))
 
-    return gene_id_ppi, idx_ppi, idx_ppi_only
+    return gene_id, individual_id, entrez_ppi, idx_filtred, mut_propag
+
+
+def preprocessing(ppi_data, mut_type, ssc_subgroups, data_folder, patient_data,
+                  ssc_mutation_data, gene_data, influence_weight,
+                  simplification, compute, overwrite, tol, ngh_max,
+                  keep_singletons, min_mutation, max_mutation, result_folder, alpha, return_val=False):
+    if mut_type == 'raw':
+        mutation_profile, mp_gene, mp_indiv = (
+            load_data.load_specific_SSC_mutation_profile(
+                data_folder, ssc_mutation_data, ssc_subgroups, gene_data))
+    else:
+        gene_id, mp_indiv, entrez_ppi, idx_filtred, mutation_profile = preprocess_noRaw(
+            ppi_data, mut_type, ssc_subgroups, data_folder, patient_data, ssc_mutation_data,
+                           gene_data, influence_weight, simplification, compute, overwrite, tol, ngh_max,
+                           keep_singletons, min_mutation, max_mutation, result_folder, alpha)
+        # Entrez Gene ID in filtered/formatted mutation profile
+        mp_gene = [entrez_ppi[i] for i in idx_filtred]
+
+    if return_val:
+        return mutation_profile, mp_indiv, mp_gene, entrez_ppi, idx_filtred
 
 
 def parallel_bootstrap(result_folder, mut_type, influence_weight,
                        simplification, alpha, tol, keep_singletons,
                        ngh_max, min_mutation, max_mutation, n_components,
                        n_permutations, run_bootstrap, lambd, tol_nmf,
-                       compute_gene_clustering, sub_perm):
-    final_influence_mutation_directory = result_folder + 'final_influence/'
-    final_influence_mutation_file = (
-        final_influence_mutation_directory +
-        'final_influence_mutation_profile_{}_alpha={}_tol={}.mat'.format(
-            mut_type, alpha, tol))
-    final_influence_data = loadmat(final_influence_mutation_file)
-    mut_propag = final_influence_data['mut_propag']
-
-    ppi_final_file = (
-        final_influence_mutation_directory +
-        'PPI_final_weight={}_simp={}_alpha={}_tol={}_singletons={}_ngh={}.mat'
-        .format(influence_weight, simplification, alpha, tol, keep_singletons,
-                ngh_max))
-    ppi_final_data = loadmat(ppi_final_file)
-    ppi_final = ppi_final_data['ppi_final']
-
-    nmf_bootstrap.bootstrap(
-        result_folder, mut_type, mut_propag, ppi_final,
-        influence_weight, simplification, alpha, tol, keep_singletons,
-        ngh_max, min_mutation, max_mutation, n_components,
-        n_permutations, run_bootstrap, lambd, tol_nmf,
-        compute_gene_clustering, sub_perm)
+                       compute_gene_clustering, sub_perm, data_folder, ssc_mutation_data, ssc_subgroups, gene_data):
+    # if mut_type == 'raw':
+    #     mut_propag, mp_gene, mp_indiv = (
+    #         load_data.load_specific_SSC_mutation_profile(
+    #             data_folder, ssc_mutation_data, ssc_subgroups, gene_data))
+    #     # fake
+    #     ppi_final_file = '../data/result_MAF1_LoF_mis15_SSC_all_allGenes_APID/final_influence/PPI_final_weight=min_simp=True_alpha=0.7_tol=0.01_singletons=False_ngh=11.mat'
+    #
+    # else:
+    #     final_influence_mutation_directory = result_folder + 'final_influence/'
+    #     final_influence_mutation_file = (
+    #         final_influence_mutation_directory +
+    #         'final_influence_mutation_profile_{}_alpha={}_tol={}.mat'.format(
+    #             mut_type, alpha, tol))
+    #     final_influence_data = loadmat(final_influence_mutation_file)
+    #     mut_propag = final_influence_data['mut_propag']
+    #
+    #     ppi_final_file = (
+    #         final_influence_mutation_directory +
+    #         'PPI_final_weight={}_simp={}_alpha={}_tol={}_singletons={}_ngh={}.mat'
+    #         .format(influence_weight, simplification, alpha, tol, keep_singletons,
+    #                 ngh_max))
+    # ppi_final_data = loadmat(ppi_final_file)
+    # ppi_final = ppi_final_data['ppi_final']
+    #
+    # nmf_bootstrap.bootstrap(
+    #     result_folder, mut_type, mut_propag, ppi_final,
+    #     influence_weight, simplification, alpha, tol, keep_singletons,
+    #     ngh_max, min_mutation, max_mutation, n_components,
+    #     n_permutations, run_bootstrap, lambd, tol_nmf,
+    #     compute_gene_clustering, sub_perm)
+
+    nmf_bootstrap.bootstrap(data_folder, ssc_mutation_data, ssc_subgroups, gene_data,
+                  result_folder, mut_type, influence_weight,
+                  simplification, alpha, tol, keep_singletons, ngh_max,
+                  min_mutation, max_mutation, n_components, n_permutations,
+                  run_bootstrap, lambd, tol_nmf,
+                  compute_gene_clustering, sub_perm)
 
 
 def post_bootstrap(result_folder, mut_type, influence_weight, simplification,
                    alpha, tol, keep_singletons, ngh_max, min_mutation,
                    max_mutation, n_components, n_permutations, lambd, tol_nmf,
-                   compute_gene_clustering, run_consensus, linkage_method,
+                   compute_gene_clustering, run_consensus, run_bootstrap,
+                   linkage_method,
                    ppi_data, patient_data, data_folder, ssc_subgroups,
                    ssc_mutation_data, gene_data, p_val_threshold, compute,
                    overwrite):