-
Notifications
You must be signed in to change notification settings - Fork 21
/
load_SubreadOutput.R
78 lines (67 loc) · 3.92 KB
/
load_SubreadOutput.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
## Load Fcount output from : DEXSeq_after_Fcount.R into DEXSeq
## Copyright 2015 Vivek Bhardwaj ([email protected]). Licence: GPLv3.
suppressPackageStartupMessages({
require(DEXSeq)
})
## Read Fcount output and convert to dxd
DEXSeqDataSetFromFeatureCounts <- function (countfile, sampleData,
design = ~sample + exon + condition:exon, flattenedfile = NULL)
{
# Take a fcount file and convert it to dcounts for dexseq
message("Reading and adding Exon IDs for DEXSeq")
read.table(countfile,skip = 2) %>% dplyr::arrange(V1,V3,V4) %>% dplyr::select(-(V2:V6)) -> dcounts
colnames(dcounts) <- c("GeneID", rownames(sampleData) )
id <- as.character(dcounts[,1])
n <- id
split(n,id) <- lapply(split(n ,id), seq_along )
rownames(dcounts) <- sprintf("%s%s%03.f",id,":E",as.numeric(n))
dcounts <- dcounts[,2:ncol(dcounts)]
dcounts <- dcounts[substr(rownames(dcounts), 1, 1) != "_", ] #remove _ from beginnning of gene name
## get genes and exon names out
splitted <- strsplit(rownames(dcounts), ":")
exons <- sapply(splitted, "[[", 2)
genesrle <- sapply(splitted, "[[", 1)
## parse the flattened file
if (!is.null(flattenedfile)) {
aggregates <- read.delim(flattenedfile, stringsAsFactors = FALSE,
header = FALSE)
colnames(aggregates) <- c("chr", "source", "class", "start",
"end", "ex", "strand", "ex2", "attr")
aggregates$strand <- gsub("\\.", "*", aggregates$strand)
aggregates <- aggregates[which(aggregates$class == "exon"), # exonic_part
]
aggregates$attr <- gsub("\"|=|;", "", aggregates$attr)
aggregates$gene_id <- sub(".*gene_id\\s(\\S+).*", "\\1",
aggregates$attr)
# trim the gene_ids to 255 chars in order to match with featurecounts
longIDs <- sum(nchar(unique(aggregates$gene_id)) > 255)
warning(paste0(longIDs,
" aggregate geneIDs were found truncated in featureCounts output"),
call. = FALSE)
aggregates$gene_id <- substr(aggregates$gene_id,1,255)
transcripts <- gsub(".*transcripts\\s(\\S+).*", "\\1",
aggregates$attr)
transcripts <- strsplit(transcripts, "\\+")
exonids <- gsub(".*exon_number\\s(\\S+).*", "\\1", # exonic_part_number
aggregates$attr)
exoninfo <- GRanges(as.character(aggregates$chr), IRanges(start = aggregates$start,
end = aggregates$end), strand = aggregates$strand)
names(exoninfo) <- paste(aggregates$gene_id, exonids,
sep = ":E")
names(transcripts) <- names(exoninfo)
if (!all(rownames(dcounts) %in% names(exoninfo))) {
stop("Count files do not correspond to the flattened annotation file")
}
matching <- match(rownames(dcounts), names(exoninfo))
stopifnot(all(names(exoninfo[matching]) == rownames(dcounts)))
stopifnot(all(names(transcripts[matching]) == rownames(dcounts)))
dxd <- DEXSeqDataSet(dcounts, sampleData, design, exons,
genesrle, exoninfo[matching], transcripts[matching])
return(dxd)
}
else {
dxd <- DEXSeqDataSet(dcounts, sampleData, design, exons,
genesrle)
return(dxd)
}
}