From 19da25a9ccaa65ed79f4bebb77148422afc2c3f1 Mon Sep 17 00:00:00 2001 From: Sam Westreich Date: Tue, 28 Aug 2018 11:54:01 -0700 Subject: [PATCH] PEAR output properly handled, counts tables R scripts added --- R_scripts/get_normalized_counts_table.R | 158 ++++++++++++++++++ R_scripts/get_raw_counts_table.R | 141 ++++++++++++++++ bash_scripts/master_script.sh | 2 +- .../test_of_master_script_tiny.bash | 2 +- 4 files changed, 301 insertions(+), 2 deletions(-) create mode 100644 R_scripts/get_normalized_counts_table.R create mode 100644 R_scripts/get_raw_counts_table.R diff --git a/R_scripts/get_normalized_counts_table.R b/R_scripts/get_normalized_counts_table.R new file mode 100644 index 0000000..014d1b8 --- /dev/null +++ b/R_scripts/get_normalized_counts_table.R @@ -0,0 +1,158 @@ +# Created 6/22/16, updated 6/16/2017 +# Run with --help flag for help. + +suppressPackageStartupMessages({ + library(optparse) +}) + +option_list = list( + make_option(c("-I", "--input"), type="character", default="./", + help="Input directory", metavar="character"), + make_option(c("-O", "--out"), type="character", default="normalized_counts_table.tab", + help="output file name [default= %default]", metavar="character"), + make_option(c("-R", "--raw_counts"), type="character", default=NULL, + help="raw (total) read counts for this starting file", metavar="character") +) + +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + +print("USAGE: $ get_normalized_counts_table.R -I working_directory/ -O save.filename") + +# check for necessary specs +if (is.null(opt$input)) { + print ("WARNING: No working input directory specified with '-I' flag.") + stop() +} else { cat ("Working directory is ", opt$input, "\n") + wd_location <- opt$input + setwd(wd_location) } + +if (is.null(opt$out)) { + print ("WARNING: No save name for DESeq results specified; defaulting to 'normalized_counts_table.tab'.") + save_filename <- opt$out +} else { save_filename <- opt$out } + +if (is.null(opt$raw_counts)) { + print ("WARNING: no raw counts file specified, skipping this info for DESeq analysis.") +} else { + counts_file <- opt$raw_counts +} + +# import other necessary packages +suppressPackageStartupMessages({ + library(DESeq2) +}) + +# GET FILE NAMES +control_files <- list.files( + pattern = "control_*", full.names = T, recursive = FALSE) +control_names = "" +for (name in control_files) { + control_names <- c(control_names, unlist(strsplit(name, split='_', fixed=TRUE))[2])} +control_names <- control_names[-1] +control_names_trimmed = "" +for (name in control_names) { + control_names_trimmed <- c(control_names_trimmed, unlist(strsplit(name, split='.', fixed=TRUE))[1])} +control_names_trimmed <- control_names_trimmed[-1] + +exp_files <- list.files( + pattern = "experimental_*", full.names = T, recursive = FALSE) +exp_names = "" +for (name in exp_files) { + exp_names <- c(exp_names, unlist(strsplit(name, split='_', fixed=TRUE))[2])} +exp_names <- exp_names[-1] +exp_names_trimmed = "" +for (name in exp_names) { + exp_names_trimmed <- c(exp_names_trimmed, unlist(strsplit(name, split='.', fixed=TRUE))[1])} +exp_names_trimmed <- exp_names_trimmed[-1] + +# READ IN FILES +# loading the control table +y <- 0 +for (x in control_files) { + y <- y + 1 + if (y == 1) { + control_table <- read.table(file = x, header = F, quote = "", sep = "\t") + colnames(control_table) = c("DELETE", x, "V3") + control_table <- control_table[,c(2,3)] } + if (y > 1) { + temp_table <- read.table(file = x, header = F, quote = "", sep = "\t") + colnames(temp_table) = c("DELETE", x, "V3") + temp_table <- temp_table[,c(2,3)] + control_table <- merge(control_table, temp_table, by = "V3", all = T) } +} +control_table[is.na(control_table)] <- 0 +rownames(control_table) = control_table$V3 +control_table_trimmed <- control_table[,-1] + +# loading the experimental table +y <- 0 +for (x in exp_files) { + y <- y + 1 + if (y == 1) { + exp_table <- read.table(file = x, header=F, quote = "", sep = "\t") + colnames(exp_table) = c("DELETE", x, "V3") + exp_table <- exp_table[,c(2,3)] } + if (y > 1) { + temp_table <- read.table(file = x, header = F, quote = "", sep = "\t") + colnames(temp_table) = c("DELETE", x, "V3") + exp_table <- merge(exp_table, temp_table[,c(2,3)], by = "V3", all = T) } +} +exp_table[is.na(exp_table)] <- 0 +rownames(exp_table) = exp_table$V3 +exp_table_trimmed <- exp_table[,-1] + +# getting the column names simplified +colnames(control_table_trimmed) = control_names_trimmed +colnames(exp_table_trimmed) = exp_names_trimmed + +# merging the two tables +complete_table <- merge(control_table_trimmed, exp_table_trimmed, by=0, all = TRUE) +complete_table[is.na(complete_table)] <- 0 +rownames(complete_table) <- complete_table$Row.names +complete_table <- complete_table[!(complete_table$Row.names == ""), ] +# reducing stuff down to avoid duplicates +complete_table <- complete_table[, lapply(.SD, sum), by=complete_table$Row.names] +complete_table <- aggregate(. ~ Row.names, data = complete_table, sum) +# removing extra Row.names column +complete_table <- complete_table[,-1] + +# OPTIONAL: importing the raw counts +if (is.null(opt$raw_counts) == FALSE) { + raw_counts_table <- read.table(counts_file, header=FALSE, sep = "\t", quote = "") + raw_counts_table <- data.frame(raw_counts_table, + do.call(rbind, strsplit(as.character(raw_counts_table$V1),'_'))) + raw_counts_table$X2 <- as.numeric(as.character(raw_counts_table$X2)) + raw_counts_table <- t(raw_counts_table[,c("X2", "V2")]) + row.names(raw_counts_table) <- c("SAMPLE","RAW TOTAL") + colnames(raw_counts_table) <- raw_counts_table[1,] + raw_counts_table <- as.data.frame(raw_counts_table) + raw_counts_table <- raw_counts_table[-1,] + + # Need to subtract off the total number of annotations + raw_counts_table["ANNOTATION COUNT",] <- colSums(complete_table) + raw_counts_table["OTHER",] <- raw_counts_table[1,] - raw_counts_table[2,] + + complete_table <- rbind(complete_table, raw_counts_table["OTHER",]) +} + +# DESeq statistical calculations +completeCondition <- data.frame(condition=factor(c( + rep(paste("control", 1:length(control_files), sep=".")), + rep(paste("experimental", 1:length(exp_files), sep="."))))) +completeCondition1 <- t(completeCondition) +colnames(complete_table) <- completeCondition1 +completeCondition2 <- data.frame(condition=factor(c( + rep("control", length(control_files)), + rep("experimental", length(exp_files))))) + +dds <- DESeqDataSetFromMatrix(complete_table, completeCondition2, ~condition) +dds <- DESeq(dds) + +# getting normalized counts +dds <- estimateSizeFactors(dds) +normalized_counts_table = counts(dds, normalized = TRUE) + +# saving and finishing up +cat ("\nSuccess!\nSaving normalized counts table as ", save_filename, "\n") +write.table(normalized_counts_table, file = save_filename, append = FALSE, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) diff --git a/R_scripts/get_raw_counts_table.R b/R_scripts/get_raw_counts_table.R new file mode 100644 index 0000000..305ed9c --- /dev/null +++ b/R_scripts/get_raw_counts_table.R @@ -0,0 +1,141 @@ +# Created 6/22/16, updated 6/16/2017 +# Run with --help flag for help. + +suppressPackageStartupMessages({ + library(optparse) +}) + +option_list = list( + make_option(c("-I", "--input"), type="character", default="./", + help="Input directory", metavar="character"), + make_option(c("-O", "--out"), type="character", default="raw_counts_table.tab", + help="output file name [default= %default]", metavar="character"), + make_option(c("-R", "--raw_counts"), type="character", default=NULL, + help="raw (total) read counts for this starting file", metavar="character") +) + +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + +print("USAGE: $ get_raw_counts_table.R -I working_directory/ -O save.filename") + +# check for necessary specs +if (is.null(opt$input)) { + print ("WARNING: No working input directory specified with '-I' flag.") + stop() +} else { cat ("Working directory is ", opt$input, "\n") + wd_location <- opt$input + setwd(wd_location) } + +if (is.null(opt$out)) { + print ("WARNING: No save name for DESeq results specified; defaulting to 'raw_counts_table.tab'.") + save_filename <- opt$out +} else { save_filename <- opt$out } + +if (is.null(opt$raw_counts)) { + print ("WARNING: no raw counts file specified, skipping this info for DESeq analysis.") +} else { + counts_file <- opt$raw_counts +} + +# import other necessary packages +#suppressPackageStartupMessages({ +# library(DESeq2) +#}) + +# GET FILE NAMES +control_files <- list.files( + pattern = "control_*", full.names = T, recursive = FALSE) +control_names = "" +for (name in control_files) { + control_names <- c(control_names, unlist(strsplit(name, split='_', fixed=TRUE))[2])} +control_names <- control_names[-1] +control_names_trimmed = "" +for (name in control_names) { + control_names_trimmed <- c(control_names_trimmed, unlist(strsplit(name, split='.', fixed=TRUE))[1])} +control_names_trimmed <- control_names_trimmed[-1] + +exp_files <- list.files( + pattern = "experimental_*", full.names = T, recursive = FALSE) +exp_names = "" +for (name in exp_files) { + exp_names <- c(exp_names, unlist(strsplit(name, split='_', fixed=TRUE))[2])} +exp_names <- exp_names[-1] +exp_names_trimmed = "" +for (name in exp_names) { + exp_names_trimmed <- c(exp_names_trimmed, unlist(strsplit(name, split='.', fixed=TRUE))[1])} +exp_names_trimmed <- exp_names_trimmed[-1] + +# READ IN FILES +# loading the control table +y <- 0 +for (x in control_files) { + y <- y + 1 + if (y == 1) { + control_table <- read.table(file = x, header = F, quote = "", sep = "\t") + colnames(control_table) = c("DELETE", x, "V3") + control_table <- control_table[,c(2,3)] } + if (y > 1) { + temp_table <- read.table(file = x, header = F, quote = "", sep = "\t") + colnames(temp_table) = c("DELETE", x, "V3") + temp_table <- temp_table[,c(2,3)] + control_table <- merge(control_table, temp_table, by = "V3", all = T) } +} +control_table[is.na(control_table)] <- 0 +rownames(control_table) = control_table$V3 +control_table_trimmed <- control_table[,-1] + +# loading the experimental table +y <- 0 +for (x in exp_files) { + y <- y + 1 + if (y == 1) { + exp_table <- read.table(file = x, header=F, quote = "", sep = "\t") + colnames(exp_table) = c("DELETE", x, "V3") + exp_table <- exp_table[,c(2,3)] } + if (y > 1) { + temp_table <- read.table(file = x, header = F, quote = "", sep = "\t") + colnames(temp_table) = c("DELETE", x, "V3") + exp_table <- merge(exp_table, temp_table[,c(2,3)], by = "V3", all = T) } +} +exp_table[is.na(exp_table)] <- 0 +rownames(exp_table) = exp_table$V3 +exp_table_trimmed <- exp_table[,-1] + +# getting the column names simplified +colnames(control_table_trimmed) = control_names_trimmed +colnames(exp_table_trimmed) = exp_names_trimmed + +# merging the two tables +complete_table <- merge(control_table_trimmed, exp_table_trimmed, by=0, all = TRUE) +complete_table[is.na(complete_table)] <- 0 +rownames(complete_table) <- complete_table$Row.names +complete_table <- complete_table[!(complete_table$Row.names == ""), ] +# reducing stuff down to avoid duplicates +complete_table <- complete_table[, lapply(.SD, sum), by=complete_table$Row.names] +complete_table <- aggregate(. ~ Row.names, data = complete_table, sum) +# removing extra Row.names column +complete_table <- complete_table[,-1] + +# OPTIONAL: importing the raw counts +if (is.null(opt$raw_counts) == FALSE) { + raw_counts_table <- read.table(counts_file, header=FALSE, sep = "\t", quote = "") + raw_counts_table <- data.frame(raw_counts_table, + do.call(rbind, strsplit(as.character(raw_counts_table$V1),'_'))) + raw_counts_table$X2 <- as.numeric(as.character(raw_counts_table$X2)) + raw_counts_table <- t(raw_counts_table[,c("X2", "V2")]) + row.names(raw_counts_table) <- c("SAMPLE","RAW TOTAL") + colnames(raw_counts_table) <- raw_counts_table[1,] + raw_counts_table <- as.data.frame(raw_counts_table) + raw_counts_table <- raw_counts_table[-1,] + + # Need to subtract off the total number of annotations + raw_counts_table["ANNOTATION COUNT",] <- colSums(complete_table) + raw_counts_table["OTHER",] <- raw_counts_table[1,] - raw_counts_table[2,] + + complete_table <- rbind(complete_table, raw_counts_table["OTHER",]) +} + +# writing the complete table +cat("\nNow saving raw counts table as ", save_filename, "\n") +write.table(complete_table, file = save_filename, append = FALSE, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) diff --git a/bash_scripts/master_script.sh b/bash_scripts/master_script.sh index ccff257..78ea57f 100644 --- a/bash_scripts/master_script.sh +++ b/bash_scripts/master_script.sh @@ -102,7 +102,7 @@ done # Note: if skipping PEAR, make sure that all starting files are in the # $STEP_1/ folder! -for file in $STEP_1/*.merged* +for file in $STEP_1/*.merged.assembled* do shortname=`echo $file | awk -F "merged" '{print $1 "cleaned.fastq"}'` checked java -jar $TRIMMOMATIC SE -phred33 $file $shortname SLIDINGWINDOW:4:15 MINLEN:99 diff --git a/setup_and_test/test_of_master_script_tiny.bash b/setup_and_test/test_of_master_script_tiny.bash index 04ff51b..60e72f7 100644 --- a/setup_and_test/test_of_master_script_tiny.bash +++ b/setup_and_test/test_of_master_script_tiny.bash @@ -97,7 +97,7 @@ echo -e "\nPaired-end merging step completed.\n" # Note: if skipping PEAR, make sure that all starting files are in the # $STEP_1/ folder! -for file in $STEP_1/*merged* +for file in $STEP_1/*merged.assembled* do shortname=`echo $file | awk -F "merged" '{print $1 "cleaned.fastq"}'` checked java -jar $TRIMMOMATIC SE -phred33 $file $shortname SLIDINGWINDOW:4:15 MINLEN:99