-
Notifications
You must be signed in to change notification settings - Fork 0
/
ppmi_deseq_salmon_star.R
185 lines (145 loc) · 7.64 KB
/
ppmi_deseq_salmon_star.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# Title: ppmi_deseq_salmon_star.R
# Authorship: Elisa Gomez de Lope, Contact details: [email protected]
# Info: This script processes data and performs DESEQ analysis for STAR+featureCounts and Salmon quantification pipelines on all samples (PD/HC with Age & Gender as covariates)
# Usage: R ppmi_deseq_salmon_star.R
# Data: data from the quantification file at gene level, both STAR+FC & Salmon + pheno
# GC ----------------------------------------------------------------------
rm(list=ls())
gc(T)
# Packages ----------------------------------------------------------------
library(tidyverse)
library(tidyr)
library(dplyr)
library(stringr)
library(vroom)
library(DESeq2)
library(edgeR)
library(SummarizedExperiment)
# I/O ---------------------------------------------------------------------
analysis_name <- "01-dea-PDHC"
OUT_DIR_PLOTS <- paste0("../data/", analysis_name, "/01-plots")
OUT_DIR <- paste0("../data/", analysis_name, "/02-outfiles")
OUT_DIR_ENRICHMENT <- paste0("../data/", analysis_name, "/03-enrichment_tables")
PHENO.FILE <- "../data/00-cleansing/ppmi_pheno.tsv"
SALMON.FILE <- "../data/00-cleansing/ppmi_rnaseq_salmon_genequant_counts.tsv"
STAR.FILE <- "../data/00-cleansing/ppmi_rnaseq_fc_counts.tsv"
# ENSEMBL.FILE <- sort(list.files("../references/", pattern = "^ids2ensembldb_.+\\.tsv"), decreasing = TRUE)[1] # take newest version
# Main --------------------------------------------------------------------
if ((!dir.exists(OUT_DIR)) | (!dir.exists(OUT_DIR_PLOTS)) | (!dir.exists(OUT_DIR_ENRICHMENT))) {
dir.create(OUT_DIR, recursive = T)
dir.create(OUT_DIR_PLOTS, recursive = T)
dir.create(OUT_DIR_ENRICHMENT, recursive = T)
}
# Data load --------------------------------------------------------------
pheno <- vroom(PHENO.FILE, col_types = "cffdfildddddddddl")
salmon_genequant <- vroom(SALMON.FILE, col_types = cols())
fc_counts <- vroom(STAR.FILE, col_types = cols())
# prepare data for deseq analysis ----------------------------------------
# match pheno, quant and count data -> all same ids should be in all datasets but unfortunately they are not.
# assign rownames to gene ids, remove extra cols in fc counts and set salmon quants as integer
pheno <- pheno %>%
unite("patient_visit_id", c(GSEID, visit), sep = ".", remove = FALSE)
ids_phenoquant <- sort(colnames(salmon_genequant)[match(pheno$patient_visit_id, colnames(salmon_genequant))])
ids_phenocounts <- sort(colnames(fc_counts)[match(pheno$patient_visit_id, colnames(fc_counts))])
if ((length(ids_phenoquant) != length(pheno$patient_visit_id)) || (length(ids_phenocounts) != length(pheno$patient_visit_id))) {
message("###--------- ALERT: IDs in phenotype are not matching those in fc_counts or salmon quants-----------###")
if (!identical(ids_phenoquant, ids_phenocounts)) {
message("###--------- ALERT: IDs in fc_counts and salmon quants are not matching-----------###")
message("###--------- ALERT: Execution will stop -----------###")
stop()
} else {
message("###--------- ALERT: Non matching IDs will be removed from the corresponding datasets -----------###")
pheno_dsq <- pheno %>%
filter(patient_visit_id %in% ids_phenocounts)
salmon_genequant_dsq <- salmon_genequant %>%
dplyr::select(id, ids_phenoquant) %>%
remove_rownames %>%
column_to_rownames(var = "id") %>%
mutate(across(where(is.double), as.integer))
fc_counts_dsq <- fc_counts %>%
dplyr::select(Geneid, ids_phenocounts) %>%
remove_rownames %>%
column_to_rownames(var = "Geneid") %>%
mutate(across(where(is.double), as.integer))
}
} else {
message("Data in phenotype, fc_counts and salmon quants is matching")
salmon_genequant_dsq <- salmon_genequant %>%
remove_rownames %>%
column_to_rownames(var = "id") %>%
mutate(across(where(is.double), as.integer))
fc_counts_dsq <- fc_counts %>%
dplyr::select(Geneid, 7:ncol(.)) %>%
remove_rownames %>%
column_to_rownames(var = "Geneid") %>%
mutate(across(where(is.double), as.integer))
}
# export dsq data
readr::write_tsv(pheno_dsq, file = file.path(OUT_DIR, "ppmi_pheno_dsq.tsv"))
readr::write_tsv(salmon_genequant_dsq %>%
rownames_to_column(var = "geneid"), file = file.path(OUT_DIR, "salmon_genequant_dsq.tsv"))
readr::write_tsv(fc_counts_dsq %>%
rownames_to_column(var = "geneid") , file = file.path(OUT_DIR, "fc_counts_dsq.tsv"))
message("Data prepared for DESeq analysis")
rm(pheno, salmon_genequant, fc_counts)
# filtering & mean variance plots ----------------------------------------
# filtering of rows with no or nearly no information about the amount of gene expression with edgeR::filterByExpr()
# By default, the function keeps genes with about 10 read counts or more in a minimum number of samples, where the number of samples is chosen according to the minimum group sample size.
# mean - variance plot of filtered/unfiltered counts in salmon and star
# build DESeq2 object
E = list()
E[["salmon"]] <- DESeq2::DESeqDataSetFromMatrix(salmon_genequant_dsq, colData = pheno_dsq, design = ~ Diagnosis + Age + Gender)
E[["STAR"]] <- DESeq2::DESeqDataSetFromMatrix(fc_counts_dsq, colData = pheno_dsq, design = ~ Diagnosis + Age + Gender)
E.flt <- list()
E.flt.vst <- list()
plot_name <- "mean-variance.pdf"
pdf(file = file.path(OUT_DIR_PLOTS, plot_name), width = 12, height = 8)
par(mfrow = c(1,2))
for (pipeline in c("STAR", "salmon")) {
exprs <- DGEList(counts(E[[pipeline]]), group = colData(E[[pipeline]])$Diagnosis)
v <- voom(exprs, plot = T)
gene.flt <- edgeR::filterByExpr(exprs)
E.flt[[pipeline]] <- E[[pipeline]][gene.flt, ]
E.flt.vst[[pipeline]] <- vst(E.flt[[pipeline]], blind = FALSE)
exprs.flt <- DGEList(assay(E.flt.vst[[pipeline]]), group = colData(E.flt.vst[[pipeline]])$Diagnosis)
v.flt <- voom(exprs.flt, plot = T)
# readr::write_tsv(as.data.frame(assay(E.flt.vst[[pipeline]])) %>%
# rownames_to_column(var = "geneid"), file = file.path(OUT_DIR, paste0("flt_transf_", pipeline, ".tsv")))
}
dev.off()
rm(E.flt.vst, E, salmon_genequant_dsq, fc_counts_dsq, pheno_dsq)
# correlation of counts in salmon and star + fcounts ------------------
ix <- match(rownames(E.flt[["STAR"]]), rownames(E.flt[["salmon"]]))
df <- data.frame(
gene = rep(rownames(E.flt[["STAR"]]), ncol(E.flt[["STAR"]])),
STAR = c(counts(E.flt[["STAR"]])),
salmon = c(counts(E.flt[["salmon"]])[ix, ])
)
plot_name <- "correlation_STAR-salmon.pdf"
pdf(file = file.path(OUT_DIR_PLOTS, plot_name), width = 10, height = 8)
ggplot(data = df, aes(x = STAR, y = salmon)) +
geom_point(shape = 16, alpha = 0.1) +
theme_bw() +
xlab("STAR + featureCounts count") +
ylab("salmon count") +
scale_x_continuous(trans = 'log10') +
scale_y_continuous(trans = 'log10') +
geom_smooth(method = "lm", formula = 'y ~ x', orientation = "y")
dev.off()
# DEA of PD/controls for star and salmon counts ------------------------------
DDS <- list()
RES <- list()
for (pipeline in c("STAR", "salmon")) {
DDS[[pipeline]] <- DESeq(E.flt[[pipeline]])
res.flt <- results(DDS[[pipeline]], contrast = c("Diagnosis", "PD", "HC"))
res.flt$gene_id <- str_replace(rownames(res.flt), '\\.[0-9]*$', '')
# res.flt$gene_name <- ensembl.db$hgnc_symbol[match(res.flt$gene_id, ensembl.db$ensembl_gene_id)]
RES[[pipeline]] <- res.flt[order(res.flt$pvalue),]
readr::write_tsv(as.data.frame(RES[[pipeline]]) %>% rownames_to_column( var = "geneid") , file = file.path(OUT_DIR, paste0("DESeq_res_", pipeline, ".tsv")))
message(paste0("DESeq analysis for ", pipeline, " completed"))
}
# Session info ------------------------------------------------------------
rm(list = ls())
gc(T)
cat("\n================\n SESSION INFO\n================\n")
sessionInfo()