diff --git a/bin/bff.R b/bin/bff.R index b0ea7af..bc31c06 100755 --- a/bin/bff.R +++ b/bin/bff.R @@ -18,7 +18,7 @@ parser$add_argument("--methodsForConsensus", help='By default, a consensus call parser$add_argument("--cellbarcodeWhitelist", help='A vector of expected cell barcodes. This allows reporting on the total set of expected barcodes, not just those in the filtered count matrix',default=NULL) parser$add_argument("--metricsFile", help='If provided, summary metrics will be written to this file.', default="metrics_cell_hash_r.csv") parser$add_argument("--doTSNE", help='If true, tSNE will be run on the resulting hashing calls after each caller.', default=TRUE) -parser$add_argument("--preprocess", help='If true, the PreProcess function by CellHashR is executed', default=FALSE) +parser$add_argument("--preprocess_bff", help='If given, the PreProcess function by CellHashR is executed', action="store_true") parser$add_argument("--barcodeWhitelist", help='A vector of barcode names to retain, used for preprocessing step', default=NULL) parser$add_argument("--doHeatmap", help='f true, Seurat::HTOHeatmap will be run on the results of each caller.', default=TRUE) parser$add_argument("--perCellSaturation", help='An optional dataframe with the columns cellbarcode and saturation',default=NULL) @@ -31,6 +31,7 @@ parser$add_argument("--assignmentOutBff", help="Prefix name for the file contain parser$add_argument("--outputdir", help='Output directory') args <- parser$parse_args() +print(args$preprocess_bff) #Parameters originally Null methodsForConsensus <- args$methodsForConsensus if(is.null(methodsForConsensus)){ @@ -62,7 +63,7 @@ Argument <- c("HTO-File", "methods", "methodsForConsensus", "cellbarcodeWhitelis Value <- c(args$fileHto, args$methods, methodsForConsensus, cellbarcodeWhitelist, args$metricsFile, perCellSaturation, majorityConsensusThreshold, callerDisagreementThreshold, args$doTSNE, args$doHeatmap,args$chemistry) params <- data.frame(Argument, Value) -if(as.logical(args$preprocess)){ +if(args$preprocess_bff == TRUE){ print("Preprocessing activated") print(args$preprocess) #get barcodes @@ -90,7 +91,7 @@ if (!is.null(args$methodsForConsensus)) { perCell_args <- args$perCellSaturation perCell <- ifelse(perCell_args == "null" || perCell_args == "Null", NULL, perCell_args) -if(args$methodsForConsensus=="bff_raw" || args$methodsForConsensus=="bff_cluster" || args$methodsForConsensus=="bff_raw,bff_cluster" || is.null(args$methodsForConsensus) ) +if(args$methodsForConsensus=="bff_raw" || args$methodsForConsensus=="bff_cluster" || args$methodsForConsensus=="bff_raw,bff_cluster" || args$methodsForConsensus=="bff_cluster,bff_raw"|| is.null(args$methodsForConsensus) ) #Only Bff in its different variations is available if (args$methods == "bff_raw") { print("Executing BFF raw") diff --git a/bin/summary_hash.py b/bin/summary_hash.py index fac8ea8..f93bb27 100755 --- a/bin/summary_hash.py +++ b/bin/summary_hash.py @@ -18,6 +18,7 @@ parser.add_argument("--generate_mudata", help="Generate mudata", action='store_true') parser.add_argument("--read_rna_mtx", help="10x-Genomics-formatted mtx directory for gene expression", default=None) parser.add_argument("--read_hto_mtx", help="10x-Genomics-formatted mtx directory for HTO expression", default=None) +parser.add_argument("--sampleId", help="sampleID if multiple samples are demultiplexed", default=None) args = parser.parse_args() @@ -28,12 +29,15 @@ def demuxem_summary(demuxem_res, raw_adata, raw_mudata): for x in demuxem_res: obs_res_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("_obs.csv")][0]) obs_res = pd.read_csv(obs_res_dir) - obs_res.loc[:, "barcodekey"] = obs_res["barcodekey"].apply(lambda x: x + "-1") - demuxem_assign = obs_res[["barcodekey", "assignment"]] - demuxem_assign.columns = ["Barcode",os.path.basename(x)] - demuxem_assign.set_index("Barcode", inplace=True) + obs_res.rename(columns={obs_res.columns[0]: "Barcode"}, inplace=True) + demuxem_assign = obs_res[["Barcode", "assignment"]] + demuxem_assign.columns = ["Barcode", os.path.basename(x)] + #demuxem_assign.loc[:, "Barcode"] = demuxem_assign["Barcode"].apply(lambda x: x + "-1") + #demuxem_assign["Barcode"] = demuxem_assign["Barcode"].astype(str) + "-1" + demuxem_assign.index = demuxem_assign.Barcode + demuxem_assign = demuxem_assign.drop(columns=['Barcode']) assign.append(demuxem_assign) - + print(assign) if raw_adata is not None: adata = raw_adata.copy() adata.obs = adata.obs.merge(demuxem_assign, left_index=True, right_index=True, how='left') @@ -51,17 +55,21 @@ def demuxem_summary(demuxem_res, raw_adata, raw_mudata): mudata.update() mudata.write("hash_summary/mudata/mudata_with_mudata_"+ os.path.basename(x)+".h5mu") - demuxem_classi = obs_res[["barcodekey", "demux_type"]] + demuxem_classi = obs_res[["Barcode", "demux_type"]] demuxem_classi.columns = ["Barcode", os.path.basename(x)] demuxem_classi = demuxem_classi.replace("unknown", "negative") - demuxem_classi.set_index("Barcode", inplace=True) + #demuxem_classi.loc[:, "Barcode"] = demuxem_classi["Barcode"].apply(lambda x: x + '-1') + demuxem_classi.index = demuxem_classi.Barcode + demuxem_classi = demuxem_classi.drop(columns=['Barcode']) classi.append(demuxem_classi) params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("params.csv")][0]) params_res = pd.read_csv(params_dir, keep_default_na=False, index_col=0) + #params_res.rename(columns={params_res.columns[1]: os.path.basename(x)}, inplace=True) params_res.columns = [os.path.basename(x)] params.append(params_res) + assign = pd.concat(assign, axis=1) assign.to_csv("hash_summary/demuxem_assignment.csv", quoting=False) @@ -104,9 +112,13 @@ def hashsolo_summary(hashsolo_res, raw_adata, raw_mudata): mudata.write("hash_summary/mudata/mudata_with_mudata_"+ os.path.basename(x)+".h5mu") hashsolo_classi = obs_res[["most_likely_hypothesis"]] - hashsolo_classi['most_likely_hypothesis'] = hashsolo_classi['most_likely_hypothesis'].apply(lambda x: 'negative' if x == 0.0 else ('singlet' if x == 1.0 else 'doublet')) - hashsolo_classi.columns = [os.path.basename(x)] - classi.append(hashsolo_classi) + hashsolo_classi_copy = hashsolo_classi.copy() + hashsolo_classi_copy.loc[hashsolo_classi_copy["most_likely_hypothesis"] == 0.0 , "most_likely_hypothesis"] = "negative" + hashsolo_classi_copy.loc[hashsolo_classi_copy["most_likely_hypothesis"] == 1.0 , "most_likely_hypothesis"] = "singlet" + hashsolo_classi_copy.loc[hashsolo_classi_copy["most_likely_hypothesis"] == 2.0 , "most_likely_hypothesis"] = "doublet" + + hashsolo_classi_copy.columns = [os.path.basename(x)] + classi.append(hashsolo_classi_copy) params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("params.csv")][0]) params_res = pd.read_csv(params_dir, keep_default_na=False, index_col=0) @@ -157,6 +169,7 @@ def hasheddrops_summary(hasheddrops_res, raw_adata, raw_mudata): mudata.write("hash_summary/mudata/mudata_with_mudata_"+ os.path.basename(x)+".h5mu") + hasheddrops_classi = obs_res[["Barcode", "Classification"]] hasheddrops_classi = hasheddrops_classi.rename(columns={"Classification": os.path.basename(x)}) classi.append(hasheddrops_classi) @@ -184,6 +197,7 @@ def multiseq_summary(multiseq_res, raw_adata, raw_mudata): multiseq_assign.columns = ["Barcode", os.path.basename(x)] multiseq_assign.set_index("Barcode", inplace=True) multiseq_assign.replace({"Doublet": "doublet", "Negative": "negative"}, inplace=True) + assign.append(multiseq_assign) if raw_adata is not None: @@ -203,6 +217,7 @@ def multiseq_summary(multiseq_res, raw_adata, raw_mudata): mudata.update() mudata.write("hash_summary/mudata/mudata_with_mudata_"+ os.path.basename(x)+".h5mu") + params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("params.csv")][0]) params_res = pd.read_csv(params_dir, usecols=[1, 2], keep_default_na=False, index_col=0) params_res.columns = [os.path.basename(x)] @@ -212,7 +227,7 @@ def multiseq_summary(multiseq_res, raw_adata, raw_mudata): assign.to_csv("hash_summary/multiseq_assignment.csv", quoting=False) classi = assign.copy() - classi[~classi.isin(["doublet", "negative"])] = "singlet" + classi[(classi != "doublet") & (classi != "negative")] = "singlet" classi.to_csv("hash_summary/multiseq_classification.csv", quoting=False) params = pd.concat(params, axis=1) @@ -226,8 +241,10 @@ def htodemux_summary(htodemux_res, raw_adata, raw_mudata): obs_res_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("_assignment_htodemux.csv")][0]) htodemux_assign = pd.read_csv(obs_res_dir) htodemux_assign.columns = ["Barcode", os.path.basename(x)] - htodemux_assign.replace({"Doublet": "doublet", "Negative": "negative"}, inplace=True) - htodemux_assign.set_index("Barcode", inplace=True) + htodemux_assign.replace("Doublet", "doublet", inplace=True) + htodemux_assign.replace("Negative", "negative", inplace=True) + htodemux_assign.index = htodemux_assign.Barcode + htodemux_assign = htodemux_assign.drop(columns=['Barcode']) assign.append(htodemux_assign) if raw_adata is not None: @@ -240,18 +257,22 @@ def htodemux_summary(htodemux_res, raw_adata, raw_mudata): if raw_mudata is not None: mudata = raw_mudata.copy() - mudata['rna'].obs = mudata['rna'].obs.merge(htodemux_assign, left_index=True, right_index=True, how='left') + mudata['rna'].obs = mudata['rna'].obs.merge(htodemux_assign, left_index=True, right_on='Barcode', how='left').set_index('Barcode') mudata['rna'].obs.rename(columns={mudata['rna'].obs.columns[0]: 'donor'}, inplace=True) mudata['rna'].obs.donor = mudata['rna'].obs.donor.fillna("negative") mudata['rna'].obs.donor = mudata['rna'].obs.donor.astype(str) mudata.update() mudata.write("hash_summary/mudata/mudata_with_mudata_"+ os.path.basename(x)+".h5mu") + obs_res_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("_classification_htodemux.csv")][0]) htodemux_classi = pd.read_csv(obs_res_dir) htodemux_classi.columns = ["Barcode", os.path.basename(x)] - htodemux_classi.replace({"Doublet": "doublet", "Negative": "negative", "Singlet": "singlet"}, inplace=True) - htodemux_classi.set_index("Barcode", inplace=True) + htodemux_classi.replace("Singlet", "singlet", inplace=True) + htodemux_classi.replace("Doublet", "doublet", inplace=True) + htodemux_classi.replace("Negative", "negative", inplace=True) + htodemux_classi.index = htodemux_classi.Barcode + htodemux_classi = htodemux_classi.drop(columns=['Barcode']) classi.append(htodemux_classi) params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename == "params.csv"][0]) @@ -267,24 +288,21 @@ def htodemux_summary(htodemux_res, raw_adata, raw_mudata): params = pd.concat(params, axis=1) params.to_csv("hash_summary/htodemux_params.csv") - - - + def gmm_summary(gmmDemux_res,raw_adata, raw_mudata): classi = [] assign = [] params = [] for x in gmmDemux_res: obs_res_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("GMM_full.csv")][0]) - #we get the number of hashes used for the experiment from the parameters #so that we now how many kinds of singlets we could find in the assignment params_dir = os.path.join(x, "params.csv") params_res = pd.read_csv(params_dir,index_col=False) params_res.columns = ["Argument", os.path.basename(x)] params.append(params_res) - + ##### Get number of hashes used for the experiment result_row = params_res[params_res['Argument'].str.contains('hto_name_gmm', case=False, na=False)] hashes_used = "" if not result_row.empty: @@ -293,46 +311,75 @@ def gmm_summary(gmmDemux_res,raw_adata, raw_mudata): print("No row contains the number of hashes") hashes = hashes_used.split(',') number_of_hashes = len(hashes) - #GMM assignement file + #----------------- + + #GMM assignement file - results GMM gmm_classi = pd.read_csv(obs_res_dir) - #GMM full is the name given by GMM_demux per default to all results + + + #GMM full is the name given by GMM_demux per default to all results + ##classif_file - contains the mapping of results classification_config = os.path.join(x, "GMM_full.config") classif_file = pd.read_csv(classification_config,header=None) #Classification and Assigment come from the same file gmm_dt = pd.DataFrame(gmm_classi) classification_dt = pd.DataFrame(classif_file) - + #change column names classification_dt = classification_dt.rename(columns={0: "Cluster_id", 1: "assignment"}) - + gmm_dt = gmm_dt.rename(columns={"Unnamed: 0": "Barcode"}) + #Create classification following the assignment found for the barcodes #we keep the original assigment and add a classification column def _classify_hash(row,number_hashes): + print(f"current row: {row}") if row == 0: return 'negative' - elif 0 > row <= number_hashes: + elif row > 0 and row <= number_hashes: + print("singlet found") return 'singlet' else: return 'doublet' classification_dt['Classification'] = classification_dt['Cluster_id'].apply(lambda x: _classify_hash(x, number_of_hashes)) - + #Compare classification guide file with classification found - merged = pd.merge(classification_dt, gmm_dt, on='Cluster_id', how='left') + #merged = pd.merge(classification_dt, gmm_dt, on='Cluster_id', how='left') + new_rows = [] + for index, row in gmm_dt.iterrows(): + cluster_id = row['Cluster_id'] + matching_row_map = classification_dt[classification_dt['Cluster_id'] == cluster_id] + if not matching_row_map.empty: + # Extract assignment and classification values from the matching row in df2 + assignment_gmm = matching_row_map.iloc[0]['assignment'] + classification_gmm = matching_row_map.iloc[0]['Classification'] + + new_row = { + 'Barcode': row['Barcode'], + 'Cluster_id': cluster_id, + 'assignment': assignment_gmm, + 'Classification': classification_gmm + } + new_rows.append(new_row) + merged = pd.DataFrame(new_rows) + + merged['assignment'] = merged.apply(lambda row: 'doublet' if 'doublet' in row['Classification'] else row['assignment'], axis=1) + print(merged) + gmm_dt['Classification'] = merged['Classification'] gmm_dt['Assignment'] = merged['assignment'] #instead of multiple hashes, add doublet - gmm_dt["Assignment"] = gmm_dt["Assignment"].apply(lambda x: "doublet" if "-" in x else x) classification_dt['Classification'] = classification_dt['Classification'].str.replace(' ', '') + + #Assigment for GMM-Demux #'Cluster_id', gmm_dt_assign = gmm_dt.drop(['Cluster_id','Confidence','Classification' ], axis=1) gmm_dt_assign['Assignment'] = gmm_dt_assign['Assignment'].str.replace(' ', '') gmm_dt_assign.columns = ["Barcode", os.path.basename(x)] - assign.append(gmm_dt_assign) if raw_adata is not None: @@ -357,7 +404,11 @@ def _classify_hash(row,number_hashes): gmm_dt_classi.columns =["Barcode", os.path.basename(x)] classi.append(gmm_dt_classi) - + params_dir = os.path.join(x, "params.csv") + params_res = pd.read_csv(params_dir,index_col=False) + params_res.columns = ["Argument", os.path.basename(x)] + params.append(params_res) + classi_df = pd.concat(classi, axis=1, join="outer") classi_df.to_csv("hash_summary" +"/GMM_classification.csv", index=False) @@ -384,6 +435,7 @@ def bff_summary(bff_res,raw_adata, raw_mudata): df = pd.DataFrame(columns=column_names) classi.append(df) assign.append(df) + else: #df contain data and we save it in the same way dt_assign = data_bff.copy() @@ -392,12 +444,16 @@ def bff_summary(bff_res,raw_adata, raw_mudata): for column in column_names: if column in dt_assign.columns: dt_assign = dt_assign.drop([column], axis=1) + #dt_assign.loc[dt_assign["consensuscall"] == "Singlet", "consensuscall"] = "singlet" dt_assign.loc[dt_assign["consensuscall"] == "Doublet", "consensuscall"] = "doublet" dt_assign.loc[dt_assign["consensuscall"] == "Negative", "consensuscall"] = "negative" dt_assign['consensuscall'] = dt_assign['consensuscall'].astype('category') dt_assign = dt_assign.rename(columns={"cellbarcode": "Barcode", "consensuscall": os.path.basename(x)}) - dt_assign['Barcode'] = dt_assign['Barcode'].apply(lambda x: str(x) + "-1") + dt_assign["Barcode"] = dt_assign["Barcode"].apply(lambda x: x + "-1" if isinstance(x, str) else x) + assign.append(dt_assign) + print("assign bff barcodes") + print(assign) if raw_adata is not None: adata = raw_adata.copy() adata.obs = adata.obs.merge(dt_assign, left_index=True, right_index=True, how='left') @@ -416,24 +472,24 @@ def bff_summary(bff_res,raw_adata, raw_mudata): mudata.update() mudata.write("hash_summary/mudata/mudata_with_mudata_"+ os.path.basename(x)+".h5mu") - + dt_classi = data_bff.copy() column_names_class = ["bff_raw","bff_cluster","consensuscall"] for column in column_names_class: - if column in dt_classi.columns: + if column in dt_assign.columns: dt_classi = dt_classi.drop([column], axis=1) dt_classi.loc[dt_classi["consensuscall.global"] == "Singlet", "consensuscall.global"] = "singlet" dt_classi.loc[dt_classi["consensuscall.global"] == "Doublet", "consensuscall.global"] = "doublet" dt_classi.loc[dt_classi["consensuscall.global"] == "Negative", "consensuscall.global"] = "negative" dt_classi = dt_classi.rename(columns={"cellbarcode": "Barcode", "consensuscall.global": os.path.basename(x)}) - dt_classi['Barcode'] = dt_classi['Barcode'].apply(lambda x: str(x) + "-1") + dt_classi["Barcode"] = dt_classi["Barcode"].apply(lambda x: x + "-1" if isinstance(x, str) else x) + print("classification bff barcodes") + print(assign) classi.append(dt_classi) - + params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename == "params.csv"][0]) - params_res = pd.read_csv(params_dir,sep=';',index_col=None) - params_res = params_res.drop(['Unnamed: 0' ], axis=1) - params_res = params_res.dropna(subset=['Argument', 'Value'], how='any') - params_res.columns = ["Argument",os.path.basename(x)] + params_res = pd.read_csv(params_dir, usecols=[1, 2], keep_default_na=False, index_col=0) + params_res.columns = [os.path.basename(x)] params.append(params_res) classi_df = pd.concat(classi, axis=1, join="outer") @@ -453,14 +509,13 @@ def bff_summary(bff_res,raw_adata, raw_mudata): os.mkdir("hash_summary") if args.generate_anndata is True: - if not os.path.exists("hash_summary/adata"): - os.mkdir("hash_summary/adata") + os.mkdir("hash_summary/adata") adata = sc.read_10x_mtx(args.read_rna_mtx) if args.generate_mudata is True: - if not os.path.exists("hash_summary/mudata"): - os.mkdir("hash_summary/mudata") + os.mkdir("hash_summary/mudata") rna_data = sc.read_10x_mtx(args.read_rna_mtx) + path_hto = args.read_hto_mtx hto_data = sc.read_10x_mtx(args.read_hto_mtx, gex_only=False) mudata = MuData({"rna": rna_data, "hto": hto_data }) @@ -483,6 +538,7 @@ def bff_summary(bff_res,raw_adata, raw_mudata): if args.htodemux is not None: htodemux_res = args.htodemux.split(':') htodemux_summary(htodemux_res, adata, mudata) + if args.gmm_demux is not None: gmmDemux_res = args.gmm_demux.split(':') @@ -492,25 +548,29 @@ def bff_summary(bff_res,raw_adata, raw_mudata): bff_res = args.bff.split(':') bff_summary(bff_res, adata, mudata) + # Read and combine assignment files assignment = [file for file in os.listdir("hash_summary") if file.endswith("_assignment.csv")] - assignment_all = pd.DataFrame() + assignment_all = pd.read_csv(os.path.join("hash_summary", assignment[0])) + - for file in assignment: - file_path = os.path.join("hash_summary", file) - df = pd.read_csv(file_path, usecols=['Barcode', pd.read_csv(file_path).columns[1]]) - assignment_all = pd.concat([assignment_all, df.set_index('Barcode')], axis=1, join='outer') - - # Reset the index to have 'Barcode' as a regular column - assignment_all.reset_index(inplace=True) + if len(assignment) > 1: + for df in assignment[1:]: + + df = pd.read_csv(os.path.join("hash_summary", df)) + assignment_all = pd.merge(assignment_all, df, on='Barcode', how='outer') + #print("---------assignment all--------") + #print(assignment_all) + #print("-----------------") + assignment_all.to_csv("hash_summary/hashing_assignment_all.csv", index=False) # Read and combine classification files classification = [file for file in os.listdir("hash_summary") if file.endswith("_classification.csv")] classification_all = pd.read_csv(os.path.join("hash_summary", classification[0])) - - if len(classification) > 1: + + if len(classification) > 1: for df in classification[1:]: df = pd.read_csv(os.path.join("hash_summary", df)) classification_all = pd.merge(classification_all, df, on='Barcode', how='outer') diff --git a/modules/hash_demulti/bff.nf b/modules/hash_demulti/bff.nf index c656133..37d6a36 100644 --- a/modules/hash_demulti/bff.nf +++ b/modules/hash_demulti/bff.nf @@ -29,13 +29,14 @@ process bff { script: + def run_preprocess = preprocess_bff != 'False' ? " --preprocess_bff" : '' """ mkdir bff_${sampleId} bff.R --fileHto hto_data --methods $methods --methodsForConsensus $methodsForConsensus \ - --cellbarcodeWhitelist $cellbarcodeWhitelist --metricsFile bff_${sampleId}_$metricsFile \ - --doTSNE $doTSNE --doHeatmap $doHeatmap --perCellSaturation $perCellSaturation --majorityConsensusThreshold $majorityConsensusThreshold \ - --chemistry $chemistry --callerDisagreementThreshold $callerDisagreementThreshold --outputdir bff_${sampleId} \ - --assignmentOutBff $assignmentOutBff --preprocess $preprocess_bff --barcodeWhitelist $barcodeWhitelist + --cellbarcodeWhitelist $cellbarcodeWhitelist --metricsFile bff_${sampleId}_$metricsFile \ + --doTSNE $doTSNE --doHeatmap $doHeatmap --perCellSaturation $perCellSaturation --majorityConsensusThreshold $majorityConsensusThreshold \ + --chemistry $chemistry --callerDisagreementThreshold $callerDisagreementThreshold --outputdir bff_${sampleId} \ + --assignmentOutBff $assignmentOutBff ${run_preprocess} --barcodeWhitelist $barcodeWhitelist """ } diff --git a/modules/hash_demulti/gmm_demux.nf b/modules/hash_demulti/gmm_demux.nf index bb98533..5eb2d6e 100644 --- a/modules/hash_demulti/gmm_demux.nf +++ b/modules/hash_demulti/gmm_demux.nf @@ -61,7 +61,7 @@ process gmm_demux{ workflow gmm_demux_hashing{ take: - hto_matrix + input_list main: summary = params.summary report_gmm = params.report_gmm @@ -70,7 +70,7 @@ take: threshold_gmm = params.threshold_gmm ambiguous = params.ambiguous - gmm_demux(hto_matrix,summary,report_gmm,mode,extract,threshold_gmm,ambiguous) + gmm_demux(input_list,summary,report_gmm,mode,extract,threshold_gmm,ambiguous) emit: gmm_demux.out.collect() diff --git a/modules/hash_demultiplexing.nf b/modules/hash_demultiplexing.nf index ad32ef9..542bac5 100644 --- a/modules/hash_demultiplexing.nf +++ b/modules/hash_demultiplexing.nf @@ -179,8 +179,12 @@ workflow hash_demultiplexing { if (params.gmmDemux == 'True') { Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, params.hto_matrix_gmm_demux == 'raw' ? row.hto_matrix_raw : row.hto_matrix_filtered, row.hto_name_gmm) } - | gmm_demux_hashing + + | map { row-> tuple(row.sampleId, + params.hto_matrix_gmm_demux == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered, + row.hto_name_gmm )} + | set {input_list_gmm_demux} + | gmm_demux_hashing(input_list_gmm_demux) gmmDemux_out = gmm_demux_hashing.out } else { diff --git a/modules/single/hash_demulti/bff.nf b/modules/single/hash_demulti/bff.nf index 0e4082f..3070299 100644 --- a/modules/single/hash_demulti/bff.nf +++ b/modules/single/hash_demulti/bff.nf @@ -28,6 +28,7 @@ process bff { path "bff_${task.index}" script: + def run_preprocess = preprocess_bff != 'False' ? " --preprocess_bff" : '' """ mkdir bff_${task.index} @@ -35,7 +36,7 @@ process bff { --cellbarcodeWhitelist $cellbarcodeWhitelist --metricsFile bff_${task.index}_$metricsFile \ --doTSNE $doTSNE --doHeatmap $doHeatmap --perCellSaturation $perCellSaturation --majorityConsensusThreshold $majorityConsensusThreshold \ --chemistry $chemistry --callerDisagreementThreshold $callerDisagreementThreshold --outputdir bff_${task.index} --assignmentOutBff $assignmentOutBff \ - --preprocess $preprocess_bff --barcodeWhitelist $barcodeWhitelist + ${run_preprocess} --barcodeWhitelist $barcodeWhitelist """ } diff --git a/modules/single/hash_demulti/gmm_demux.nf b/modules/single/hash_demulti/gmm_demux.nf index a2f2f4e..9febe1b 100644 --- a/modules/single/hash_demulti/gmm_demux.nf +++ b/modules/single/hash_demulti/gmm_demux.nf @@ -1,15 +1,17 @@ #!/usr/bin/env nextflow nextflow.enable.dsl = 2 -process gmm_demux { - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti/gmm_demux", mode:'copy' + +process gmm_demux{ + publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/gmm_demux", mode:'copy' + label 'small_mem' conda "$projectDir/conda/gmm_demux.yml" input: - tuple val(sampleId), path(filtered_hto_matrix_dir), val(hto_name_gmm) + path filtered_hto_matrix_dir //HTO names as string separated by commas - //val hto_name_gmm + val hto_name_gmm //mode 2 //need estimate number of cells in the single cell assay //obligatory @@ -26,7 +28,9 @@ process gmm_demux { val ambiguous output: - path "gmm_demux_${sampleId}" + + path "gmm_demux_${task.index}" + script: def extract_droplets = extract != 'None' ? " -x ${extract}" : '' @@ -34,21 +38,21 @@ process gmm_demux { if (mode_GMM == 'csv') { """ - mkdir gmm_demux_${sampleId} - touch gmm_demux_${sampleId}_$report_gmm - - GMM-demux -c $filtered_hto_matrix_dir $hto_name_gmm -u $summary --report gmm_demux_${sampleId}_$report_gmm --full gmm_demux_${sampleId} $extract_droplets -t $threshold_gmm - gmm_demux_params.py --path_hto $filtered_hto_matrix_dir --hto_name_gmm $hto_name_gmm --summary $summary --report gmm_demux_${sampleId}_$report_gmm --mode $mode_GMM $extract_droplets --threshold_gmm $threshold_gmm $ambiguous_droplets --outputdir gmm_demux_${sampleId} - + mkdir gmm_demux_${task.index} + touch gmm_demux_${task.index}_$report_gmm + + GMM-demux -c $filtered_hto_matrix_dir $hto_name_gmm -u $summary --report gmm_demux_${task.index}_$report_gmm --full gmm_demux_${task.index} $extract_droplets -t $threshold_gmm + gmm_demux_params.py --path_hto $filtered_hto_matrix_dir --hto_name_gmm $hto_name_gmm --summary $summary --report gmm_demux_${task.index}_$report_gmm --mode $mode_GMM $extract_droplets --threshold_gmm $threshold_gmm $ambiguous_droplets --outputdir gmm_demux_${task.index} + """ }else { """ - mkdir gmm_demux_${sampleId} - touch gmm_demux_${sampleId}_$report_gmm - - GMM-demux $filtered_hto_matrix_dir $hto_name_gmm -u $summary -r gmm_demux_${sampleId}_$report_gmm --full gmm_demux_${sampleId} -o gmm_demux_${sampleId} $extract_droplets -t $threshold_gmm - gmm_demux_params.py --path_hto $filtered_hto_matrix_dir --hto_name_gmm $hto_name_gmm --summary $summary --report gmm_demux_${sampleId}_$report_gmm --mode $mode_GMM $extract_droplets --threshold_gmm $threshold_gmm $ambiguous_droplets --outputdir gmm_demux_${sampleId} - + mkdir gmm_demux_${task.index} + touch gmm_demux_${task.index}_$report_gmm + + GMM-demux $filtered_hto_matrix_dir $hto_name_gmm -u $summary -r gmm_demux_${task.index}_$report_gmm --full gmm_demux_${task.index} -o gmm_demux_${task.index} $extract_droplets -t $threshold_gmm + gmm_demux_params.py --path_hto $filtered_hto_matrix_dir --hto_name_gmm $hto_name_gmm --summary $summary --report gmm_demux_${task.index}_$report_gmm --mode $mode_GMM $extract_droplets --threshold_gmm $threshold_gmm $ambiguous_droplets --outputdir gmm_demux_${task.index} + """ } } @@ -57,6 +61,7 @@ workflow gmm_demux_hashing { take: hto_matrix main: + hto_name_gmm = params.hto_name_gmm summary = params.summary report_gmm = params.report_gmm mode = params.mode_GMM @@ -64,8 +69,9 @@ take: threshold_gmm = params.threshold_gmm ambiguous = params.ambiguous - gmm_demux(hto_matrix, summary, report_gmm, mode, extract, threshold_gmm, ambiguous) + gmm_demux(hto_matrix,hto_name_gmm,summary,report_gmm,mode,extract,threshold_gmm,ambiguous) + emit: gmm_demux.out.collect() } diff --git a/test.config b/test.config index 5c74183..f1c0044 100644 --- a/test.config +++ b/test.config @@ -1,8 +1,8 @@ params { outdir = "result_test" // input for hashing-based deconvolution - hto_matrix_raw = "$projectDir/test_data/hto" - hto_matrix_filtered = "$projectDir/test_data/hto" + hto_matrix_raw = "$projectDir/test_data/hto_gz" + hto_matrix_filtered = "$projectDir/test_data/hto_gz" rna_matrix_raw = "$projectDir/test_data/rna" rna_matrix_filtered = "$projectDir/test_data/rna" @@ -23,4 +23,7 @@ params { // donor genotype file provided by popscle doesnt work on souporcell use_known_genotype = "False" ignore = "True" + //gmm_demux input + hto_name_gmm = "hto_1,hto_2" + } diff --git a/test_data/hto_gz/barcodes.tsv.gz b/test_data/hto_gz/barcodes.tsv.gz new file mode 100644 index 0000000..dd04944 Binary files /dev/null and b/test_data/hto_gz/barcodes.tsv.gz differ diff --git a/test_data/hto_gz/features.tsv.gz b/test_data/hto_gz/features.tsv.gz new file mode 100644 index 0000000..38d469b Binary files /dev/null and b/test_data/hto_gz/features.tsv.gz differ diff --git a/test_data/hto_gz/matrix.mtx.gz b/test_data/hto_gz/matrix.mtx.gz new file mode 100644 index 0000000..e9e65fc Binary files /dev/null and b/test_data/hto_gz/matrix.mtx.gz differ diff --git a/test_data/rna_gz/barcodes.tsv.gz b/test_data/rna_gz/barcodes.tsv.gz new file mode 100644 index 0000000..dd04944 Binary files /dev/null and b/test_data/rna_gz/barcodes.tsv.gz differ diff --git a/test_data/rna_gz/genes.tsv.gz b/test_data/rna_gz/genes.tsv.gz new file mode 100644 index 0000000..c240cef Binary files /dev/null and b/test_data/rna_gz/genes.tsv.gz differ diff --git a/test_data/rna_gz/matrix.mtx.gz b/test_data/rna_gz/matrix.mtx.gz new file mode 100644 index 0000000..d0e9510 Binary files /dev/null and b/test_data/rna_gz/matrix.mtx.gz differ