-
Notifications
You must be signed in to change notification settings - Fork 4
/
plot_busco_combined_table.R
78 lines (64 loc) · 3.57 KB
/
plot_busco_combined_table.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# plot combined BUSCO results from multiple taxa
# last modified 2023-06-13
# PDF will be automatically generated
# input file was generated by
# ./combine_busco_table_results.py -s species_to_dir_list.tab > hexa_species_all_busco_results.tab
library(tidyr)
library(ggplot2)
args = commandArgs(trailingOnly=TRUE)
inputfile = args[1]
#inputfile = "~/git/Aphrocallistes_vastus_genome/hexa_phylogeny/hexa_species_all_busco_results.tab"
if (is.na(inputfile)){print("# ERROR: NO INPUT FILE")}
outputfile = gsub("([\\w/]+)\\....$","\\1.pdf",gsub(".gz","",inputfile),perl=TRUE)
print(paste("# reading input file from", inputfile, sep=" " ))
busco_data = read.table(inputfile, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
dim(busco_data)
gene_ids = busco_data$gene_id
sp_names = names(busco_data[,-1])
gene_totals = rowSums(busco_data[,-1])
is_empty_gene = (gene_totals == 0)
#gene_ids[is_empty_gene]
#rownames(busco_data)[gene_totals == 0]
# [1] "239149at33208" "257141at33208" "344346at33208" "346541at33208" "359532at33208" "361216at33208"
# [7] "495802at33208" "522988at33208" "526839at33208" "539215at33208" "556039at33208" "557694at33208"
# [13] "558385at33208" "560566at33208" "591391at33208" "608669at33208" "640187at33208" "643954at33208"
#hexa_totals = rowSums(busco_data[,2:21])
#demo_totals = rowSums(busco_data[,22:23])
#rownames(busco_data)[hexa_totals == 0 & demo_totals > 0]
#gene_ids[hexa_totals == 0 & demo_totals > 0]
# [1] "14776at33208" "40200at33208" "108764at33208" "114954at33208" "121127at33208" "157138at33208"
# [7] "201397at33208" "203698at33208" "206858at33208" "273367at33208" "279992at33208" "295556at33208"
# [13] "334272at33208" "356175at33208" "360379at33208" "360753at33208" "365561at33208" "383313at33208"
# [19] "390402at33208" "393219at33208" "399439at33208" "412930at33208" "430701at33208" "432287at33208"
# [25] "435253at33208" "439736at33208" "440195at33208" "447320at33208" "449056at33208" "464243at33208"
# [31] "470197at33208" "493505at33208" "495329at33208" "502584at33208" "503828at33208" "508986at33208"
# [37] "523376at33208" "524148at33208" "529708at33208" "534172at33208" "547099at33208" "549988at33208"
# [43] "552949at33208" "553411at33208" "556461at33208" "569812at33208" "582951at33208" "584333at33208"
# [49] "590347at33208" "601875at33208" "601886at33208" "606795at33208" "613522at33208" "619250at33208"
# [55] "625091at33208" "628231at33208" "631767at33208" "641073at33208" "648300at33208"
# complete duplicated frag none
busco_colors = c("#41b712ff","#d4f172ff","#ac38a2ff","#EEEEEE")
rstarts = c(1,97,193,289,384,481,577,673,769,865)
rends = c(96,192,288,384,480,576,672,768,864,954)
# print multipage pdf
print(paste("# generating PDF as", outputfile, sep=" " ))
pdf(file = outputfile, width = 11, height = 8, paper = "a4r", title = "BUSCO plot" )
for (i in 1:length(rstarts)){
bdlong = busco_data[rstarts[i]:rends[i],] %>%
pivot_longer(cols = !gene_id, names_to = "species", values_to = "match")
gene_occ_color = ifelse( is_empty_gene[rstarts[i]:rends[i]], "#c22294", "black")
print(
ggplot(bdlong, aes(y= species, x = gene_id, fill = as.factor(match))) +
theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1, colour=gene_occ_color),
legend.title=element_blank(),
legend.position = "top",
legend.direction = "horizontal") +
labs(x=NULL, y=NULL,
caption = paste("Page",i,"of",length(rstarts))) +
scale_fill_manual(values=busco_colors, breaks=c(1, 100, 0.01, 0 ),
labels=c("Complete", "Duplicated", "Fragmented", "Missing")) +
geom_tile()
)
}
dev.off()
#