-
Notifications
You must be signed in to change notification settings - Fork 0
/
Rscript.r
201 lines (174 loc) · 7.32 KB
/
Rscript.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
## Protease cleavage site specificity analysis - PICS
# If you have any problem running the code you can check the following:
# 1 - pay attention to the conflicts between the packages used here and those ones you have appended
# 2 - run the sessionInfo() to see the versions of each package used
# 3 - of course, check the names of your files peptide.tsv and protein.fas. Sometimes people change names
# output of the sessionInfo() in my workflow
# other attached packages:
# [1] ggseqlogo_0.1 janitor_2.2.0 pheatmap_1.0.12 lubridate_1.9.3
# [5] forcats_1.0.0 stringr_1.5.0 dplyr_1.1.3 purrr_1.0.2
# [9] readr_2.1.4 tidyr_1.3.0 tibble_3.2.1 ggplot2_3.4.4
#[13] tidyverse_2.0.0 seqinr_4.2-30
library(seqinr)
library(tidyverse)
library(pheatmap)
library(janitor)
library(ggseqlogo)
# replace the comment with your working directory in setwd("")
# the working directory should be the folder where you have the peptide.tsv and the protein.fas files.
setwd("/yourWorkingDirectoryHere")
# load the peptide file from FragPipe output as peptides_df
peptides_df <- read_tsv("peptide.tsv") %>%
clean_names() %>%
rename(name = protein)
# read the fasta file and store sequences
fasta_sequences <- read.fasta("protein.fas",
seqtype = "AA",
as.string = TRUE
)
# extract names from fasta file
fasta_names <- sapply(fasta_sequences, attr, "name")
# function to match names and extract sequences
matched_sequences <- lapply(peptides_df$name, function(name) {
idx <- match(name, fasta_names)
if (!is.na(idx)) {
return(as.character(fasta_sequences[[idx]]))
} else {
return("Name not found in fasta file")
}
})
# add matched sequences to the data frame
peptides_df$protein_sequence <- matched_sequences
# match peptide sequence, extract previous and next 4 amino acids, and add to new columns
peptides_df <- peptides_df %>%
mutate(previous_4_aa = sapply(1:nrow(.), function(row_idx) {
peptide <- peptides_df$peptide[row_idx]
protein <- peptides_df$protein_sequence[row_idx]
match_idx <- regexpr(peptide, protein)
if (match_idx > 0) {
start_idx <- max(match_idx - 4, 1)
previous_aa <- substr(protein, start_idx, match_idx - 1)
return(previous_aa)
} else {
return("Peptide not found in protein")
}
}),
next_4_aa = sapply(1:nrow(.), function(row_idx) {
peptide <- peptides_df$peptide[row_idx]
protein <- peptides_df$protein_sequence[row_idx]
match_idx <- regexpr(peptide, protein)
if (match_idx > 0) {
end_idx <- match_idx + attr(match_idx, "match.length")
next_aa <- substr(protein, end_idx, end_idx + 3)
return(next_aa)
} else {
return("Peptide not found in protein")
}
}))
# Add new columns with the previous and the next 4 amino acids + the peptide fragment
# hint 1 - if you did your search in MaxQuant you already have this column in your peptide file
# hint 2 - if you are using the latest version of FragPipe and selecting your peptide sequences from psm.tsv
# file instead of peptide.tsv file, you already have this column as extended peptide sequence
peptides_df <- peptides_df %>%
dplyr::mutate(
fingerprint_Nterm = paste(previous_4_aa, peptide, sep = ""),
fingerprint_Cterm = paste(peptide, next_4_aa, sep = "")
) %>%
filter(
!grepl("Peptide not found in protein", fingerprint_Nterm),
previous_4_aa != "M"
) %>%
filter(nchar(fingerprint_Nterm) >= 8) %>% # remove peptides with less than 8 amino acids in the N-term
dplyr::mutate(
fingerprint_Nterm = substr(fingerprint_Nterm, 1, 8),
fingerprint_Cterm = substr(fingerprint_Cterm, nchar(fingerprint_Cterm) - 7, nchar(fingerprint_Cterm)),
fingerprint_Cterm = case_when(
nchar(next_4_aa) < 4 ~ NA_character_,
TRUE ~ fingerprint_Cterm
)
)
# if the column next4_aa is of lenght 0, it means that the peptide is at the end of the protein and the next_4_aa are not available.
# So the peptide is removed from the fingerprint_Cterm column
view(peptides_df)
# at this point you should be able to plot the sequence logo of the fingerprint in the N-term
ggseqlogo(
peptides_df$fingerprint_Nterm,
method = "bits",
seq_type = "AA") +
theme_bw() +
geom_vline(xintercept = 4.5, color = "black", linetype = "dashed") +
labs(
x = "Position",
y = "Frequency (bits)",
title = "Sequence logo of the N-terminal fingerprint"
) +
theme(
text = element_text(size = 25)
)
# and the sequence logo of the fingerprint in the C-term
ggseqlogo(
peptides_df$fingerprint_Cterm %>% na.omit(),
method = "bits",
seq_type = "AA",
na_col = "grey50") +
theme_bw() +
geom_vline(xintercept = 4.5, color = "black", linetype = "dashed") +
labs(
x = "Position",
y = "Frequency (bits)",
title = "Sequence logo of the C-terminal fingerprint"
) +
theme(
text = element_text(size = 25)
)
# create a list of peptide sequences including the N-term and C-term fingerprints
fingerprint_protease <- c(peptides_df$fingerprint_Nterm, peptides_df$fingerprint_Cterm) %>%
na.omit() %>%
strsplit("")
# create a matrix with the list of peptide sequences
mat_aa <- matrix(unlist(fingerprint_protease), ncol = 8, byrow = TRUE)
# remove the rows containing "B" or any other unwanted amino acids in the matrix
# mat_aa <- mat_aa[!apply(mat_aa, 1, function(x) any(x == "B")), ]
# function to calculate the frequency of each amino acid in each column in percentage
aa_freq <- function(x) {
table(x) / length(x) * 100
}
# calculate the frequency of each amino acid in each column and plot a heatmap
mat_aa_freq <- apply(mat_aa, 2, aa_freq)
# List of all 20 amino acids
# twenty_amino_acids <- c('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')
# Function to complete missing amino acids with zero and reorder if necessary
# you can skip this step if you have a complete matrix
complete_and_reorder_amino_acids <- function(element) {
# Complete missing amino acids with zero
for (amino_acid in twenty_amino_acids) {
if (!amino_acid %in% names(element)) {
element[[amino_acid]] <- 0
}
}
# Reorder amino acids
element <- element[match(twenty_amino_acids, names(element))]
return(element)
}
# If you used the function above, you want to apply it to each column of the matrix
# new_list <- lapply(mat_aa_freq, complete_and_reorder_amino_acids)
# If you didn't use the function above, you can jump to this step
# final_matrix <- matrix(unlist(new_list,), ncol = 8, byrow = FALSE)
colnames(mat_aa_freq) <- c("P4", "P3", "P2", "P1", "P1'", "P2'", "P3'", "P4'")
# row.names(final_matrix) <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")
PICS_map <- mat_aa_freq %>%
pheatmap(
cluster_rows = FALSE,
cluster_cols = FALSE,
fontsize = 20,
show_rownames = TRUE,
show_colnames = TRUE,
color = colorRampPalette(c("grey95", "#A73030FF"))(20),
main = "Cleavage Site specificity",
gaps_col = 4,
angle_col = 0
)
ggsave("PICS_map.png",
PICS_map,
width = 10, height = 12,
units = "in", dpi = 300)