forked from MelinaKlostermann/Molitor-et-al-2022
-
Notifications
You must be signed in to change notification settings - Fork 0
/
0_get_stuff_ready.Rmd
187 lines (138 loc) · 5.73 KB
/
0_get_stuff_ready.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
---
title: "0 Getting thing ready: filter annotation, import crosslinks"
author: "Melina"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
BiocStyle::html_document:
toc_float: TRUE
toc: TRUE
vignette: >
%\VignetteIndexEntry{AS USP39 KD}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, results = TRUE)
```
# What was done?
- The gencode annotation retrieved as gtf file is filtered for standard chromosomes. Transcripts with transcript support level <=3 or NA are only kept if no other transcript of the same gene with a higher levels exists.
- Crosslinks files are imported as Granges, split into 1nt size and stored
```{r}
# libraries
library(tidyverse)
library(rtracklayer)
library(GenomicRanges)
library(GenomicFeatures)
```
```{r}
# bigwig files of crosslink events (all 4 samples merged)
bw_all_plus_path <-
"/Users/melinaklostermann/Documents/projects/PURA/01_raw_data/PURA_endo/imb_koenig_2020_07_koenig_iCLIP_PURA_endogene/merged/bw/imb_koenig_2020_07_PURAendo.v2uniqMD.duprm.plus.bw"
bw_all_minus_path <-
"/Users/melinaklostermann/Documents/projects/PURA/01_raw_data/PURA_endo/imb_koenig_2020_07_koenig_iCLIP_PURA_endogene/merged/bw/imb_koenig_2020_07_PURAendo.v2uniqMD.duprm.minus.bw"
# annotation
mygft <- "/Users/melinaklostermann/Documents/projects/anno/GENCODEv31-p12/gencode.v31.annotation.gff3"
```
# Crosslink files
```{r}
bw_all_plus <- import.bw(bw_all_plus_path)
bw_all_minus <- import.bw(bw_all_minus_path)
strand(bw_all_plus) <- "+"
strand(bw_all_minus) <- "-"
################################
# Split crosslinks to 1-nt events
################################
split_bw_crosslinks_to_1_nt <- function(bw){
# split ranges in 1 nt events
bw_split <- exomeCopy::subdivideGRanges(bw, subsize=1)
# match scores by an overlap index
idx <- findOverlaps(bw_split, bw)
# add scores
bw_split$score <- bw[subjectHits(idx)]$score
# read strand info
strand(bw_split) <- strand(bw)
return(bw_split)
}
bw_merges <- list(bw_all_plus, bw_all_minus) %>% lapply(., function(x) split_bw_crosslinks_to_1_nt(bw=x)) %>%
lapply(., function(x) keepStandardChromosomes(x, pruning.mode = "coarse"))
saveRDS(bw_merges, paste0("/Users/melinaklostermann/Documents/projects/PURA/Molitor-et-al-2022/", "bw_merges.rds"))
```
# Filter annotation
```{r}
filter_gft_anno <- function(gft_anno, standard_chrom, protein_coding, include_GL_3, longest_transcript){
library(GenomicRanges)
library(dplyr)
library(GenomicFeatures)
# load gft
gft <- rtracklayer::import(gft_anno)
# standard chromosomes
if(standard_chrom == T){
gft <-keepStandardChromosomes(gft, pruning.mode = "coarse")
}
# protein-coding transcripts
if(protein_coding == T){
gft <- gft[gft$gene_type=="protein_coding"]
gft <- c(gft[gft$type!="gene" & gft$transcript_type=="protein_coding"], gft[gft$type=="gene"])
}
# gene level 1 or 2
gft_GL<- gft[gft$level <= 2]
# gene level 3
if(include_GL_3==T){
gft_GL3 <- gft[gft$level==3 & !(gft$gene_id %in% gft_GL$gene_id)]
gft_GL <- c(gft_GL, gft_GL3)
}
# transcript support level <=3 or NA
gft_GL_TL <-gft_GL[!is.na(gft_GL$transcript_support_level) & gft_GL$transcript_support_level <= 3]
gft_TL_NA <- gft_GL[is.na(gft_GL$transcript_support_level)]
gft_Transcripts <- c(gft_GL_TL, gft_TL_NA[!(gft_TL_NA$gene_id %in% gft_GL_TL$gene_id)])
gft_gen_trans <- c(gft_Transcripts, gft_GL[gft_GL$type=="gene"]) # readd genes (have transcript support level NA)
# longest transcript
if(longest_transcript==T){
# use genomicfeatures file to get information about transcript length (sum of exons)
gft_GenFeat<- makeTxDbFromGRanges(gft_gen_trans)
transcriptLength <- transcriptLengths(gft_GenFeat)
# add transcript_length to granges
gft_GL_TL_dframe <- as.data.frame(gft_gen_trans)
gft_with_tx_len <- merge(transcriptLength[, c("tx_name", "tx_len")], gft_GL_TL_dframe, by.x ="tx_name", by.y = "transcript_id",
all.y = TRUE)
gft_longestTranscript <- gft_with_tx_len[gft_with_tx_len$type == "transcript", ] %>%
dplyr::group_by(.$gene_id) %>%
dplyr::arrange(dplyr::desc(.$tx_len)) %>%
dplyr::slice(1) %>%
ungroup()
gft_regions_longestTranscript <- gft_gen_trans[gft_gen_trans$transcript_id %in% gft_longestTranscript$tx_name]
gft_region_genes_LT <- c(gft_regions_longestTranscript, gft_gen_trans[gft_gen_trans$type=="gene" & gft_gen_trans$gene_id %in% gft_regions_longestTranscript$gene_id])
gft_gen_trans <- gft_region_genes_LT
}
return(gft_gen_trans)
}
annotation <- filter_gft_anno(gft_anno = mygft,
standard_chrom = T,
protein_coding = F,
include_GL_3 = T,
longest_transcript = F)
```
## clean annotation
- exclude pseudogenes
- throw out identical annotations
- shorten gene_ids
```{r}
# exclude pseudogenes
annotation <- annotation[!grepl(annotation$gene_type, pattern = "pseudo") & !(annotation$gene_type %in% c("TEC", "Mt_tRNA", "Mt_rRNA"))]
# throw out identical annotations
annotation <- as.data.frame(annotation) %>%
unique.data.frame(.) %>%
makeGRangesFromDataFrame(keep.extra.columns = T)
# shorten gene_ids
annotation$gene_id <- substr(annotation$gene_id, 1,15 )
# make txdb to load it faster later
anno_txdb <- makeTxDbFromGRanges(annotation)
```
```{r}
# save filtered annotation
saveRDS(annotation, paste0("/Users/melinaklostermann/Documents/projects/PURA/Molitor-et-al-2022/", "annotation.rds"))
saveDb(anno_txdb , file = "/Users/melinaklostermann/Documents/projects/PURA/Molitor-et-al-2022/annotation_txdb.db")
```
# Output
- crosslinks: bw_merges.rds
-filtered annotation: annotation.rds