shengxinbaixiaosheng
/
RNA-Seq-Workshop-Library-Preparation-and-Introduction-to-Data-Analysis-2
Public
forked from ucdavis-bioinformatics-training/RNA-Seq-Workshop-Library-Preparation-and-Introduction-to-Data-Analysis-2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Analysis_EdgeR_RNAseq.R
98 lines (78 loc) · 4.23 KB
/
Analysis_EdgeR_RNAseq.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# 1. Load the edgeR package and use the utility function
library("edgeR")
# 2. Set up the experiment
samples <- read.table("samples.txt",header=T,as.is=T)
# Set up how you'll print the sample_id and condition variables, based on the samples.txt sheet
sample_id = samples$SAMPLE_ID
condition = samples$treatment
folder = "04-HTseqCounts"
results = "05-edgeR"
dir.create(results)
annof <- ""
# 3. Identify the count files and read them into R using readDGE
countf <- sapply(file.path(folder,samples$SAMPLE_ID),dir,pattern=".counts$",full.names=T)
counts = readDGE(countf)$counts
#### HTSEQ_COUNT RESULTS
# 4. Filter weakly expressed and noninformative (e.g., non-aligned) features:
noint = rownames(counts) %in% c("__no_feature","__ambiguous","__too_low_aQual",
"__not_aligned","__alignment_not_unique")
mean(colSums(counts[!noint,])/colSums(counts))
## MEAN % of reads map to features
cpms = cpm(counts) ## counts per million
# In edgeR, it is recommended to remove features without
# at least 1 read per million in n of the samples,
# where n is the size of the smallest group of replicates,
keep = rowSums(cpms >1) >=3 & !noint
dim(counts) ## the number of features you started with
counts = counts[keep,]
dim(counts) ## count counts of features you have left over after initial filter
colnames(counts) = samples$SAMPLE_ID
# 5. Create a DGEList object (edgeR's container for RNA-seq count data):
d = DGEList(counts=counts, group=condition)
# 6. Estimate normalization factors using, RNA composition and adjust for read depth:
d = calcNormFactors(d)
# 7. Inspect the relationships between samples using a multidimensional scaling (MDS) plot, as shown in Figure 4:
pdf(file.path(results,"MDS-edgeR.pdf"))
plotMDS(d, labels=sample_id,
col = rainbow(length(levels(factor(condition))))[factor(condition)],cex=0.6, main="MDS")
dev.off()
# edgeR - using glm
# 8. Create a design matrix to specify the factors that are expected to affect
# expression levels:
design = model.matrix( ~ treatment, samples) ## samples is your sample sheet, treatment is a column in the sample sheet
design
### Here it is pH6 - pH2.5 (so positive fold change indicates higher expression in Normal, negative is higher in Affected)
# 9. Estimate dispersion values, relative to the design matrix, using the Cox-Reid (CR)-adjusted likelihood
d2 = estimateGLMCommonDisp(d, design)
d2 = estimateGLMTrendedDisp(d2, design)
d2 = estimateGLMTagwiseDisp(d2, design)
# 10. plot the mean-variance relationship:
pdf(file.path(results,"mean.variance-edgeR.pdf"))
# Plot the relationship between mean expression and variance of expression
plotMeanVar(d2, show.tagwise.vars=TRUE, NBline=TRUE,main="MeanVar")
# Plot the Biological Coefficient of Variation (as opposed to technical coefficient of variation)
plotBCV(d2,main="BCV")
dev.off()
# 11. Given the design matrix and dispersion estimates, fit a GLM to each feature:
f = glmFit(d2, design)
# 12. Perform a likelihood ratio test, specifying the difference of interest
de = glmLRT(f, coef=2) ## Treatment coefficient
# 13. Use the topTags function to present a tabular summary of the differential expression statistics
tt = topTags(de, n=nrow(d)) ## all tags, sorted
head(tt$table) ## Check result
table(tt$table$FDR< 0.05) ## the number of "Statistically Differentially Expressed Genes" at an FDR of 0.05
# 15. Inspect the depth-adjusted reads per million for some of the top differentially expressed genes:
nc = cpm(d, normalized.lib.sizes=TRUE)
rn = rownames(tt$table)
head(nc[rn,order(samples$treatment)],5)
# 16. Plot the M (log-fold change) versus A (log-average expression)
deg = rn[tt$table$FDR < .05]
pdf(file.path(results,"smear-edgeR.pdf"))
plotSmear(d, de.tags=deg,main="Smear")
dev.off()
# 17. Save the result table as a CSV file:
write.table(tt$table,file=file.path(results,"toptags_edgeR.annotated.txt"),sep="\t",row.names=T,col.names=T,quote=F)
### if you have annotation
# anno <- read.table(annof,sep="\t",header=T,comment.char="",quote="",as.is=T)
# write.table(data.frame(tt$table,anno[match(rownames(tt$table),anno$Ensembl.Gene.ID),]),file=file.path(results,"toptags_tibia_edgeR.annotated.txt"),sep="\t",row.names=T,col.names=T,quote=F)
### END OF DIFFERENTIAL EXPRESSION ANALYSIS, ADDITIONAL PROCESSING ON THE TABLE CAN BE PERFORMED