-
Notifications
You must be signed in to change notification settings - Fork 2
/
fungison.pl
160 lines (140 loc) · 5.41 KB
/
fungison.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#Created by Pablo Cruz-Morales, sept 2022
#It writes the globals.pm file using user defined parameters
#It delivers the results in a directory named after the -q option
#this script should be placed at /fungison
#the rest should be under /fungison/bin
#the genomes database must be in /fungison/bin/genomes
#the GENOMES.Ids file must be in /fungison/bin/
use strict;
#use warnings;
use Getopt::Long qw(GetOptions);
print "USAGE: perl fungison.pl <OPTIONS>\n\n";
print "OPTIONS:\n\n";
print "-q FILE.query |QUERY FILE, [a file with .query extension}\n";
print "-r 1234 |REFERENCE GENOME ID FROM GENOMES.IDs, WHEN NOT USING -d full MAKE SURE THE ENTRY IS LISTED IN -d [a number]\n";
print "-e 0.0000001 |E-VALUE CUTOFF, [a number]\n";
print "-s 200 |BIT-SCORE CUTOFF [a number]\n";
print "-f 10 |NUMBER OF FLAKING GENES INCLUDED IN THE ANALYSIS, [a number]\n";
print "-d full OR -db 1,2,3 |IDs OF THE GENOMES INCLUDED IN THE ANALYSIS, ][full= entire database OR selected genomes separated by ',' ]\n";
print "-x no or -F FORMATDB |FORMAT THE DATABASE SELECTED WITH THE -d OPTION, ['no' is the recommeded option or 'FORMATDB']\n\n";
my $query;
my $reference;
my $evalue;
my $score;
my $database;
my $flanks;
my $refname;
my $databasesize;
my $dbstatement;
my $filenames;
my $formatoption;
GetOptions(
'q=s' => \$query,
'r=s' => \$reference,
'e=s' => \$evalue,
's=s' => \$score,
'd=s' => \$database,
'f=s' => \$flanks,
'x=s' => \$formatoption,
) or die "missing parameters\n";
#printing the globals.pm file
open OUT, ">./bin/globals.pm";
#printing defaults
print OUT "use lib \'\.\/\'\;\n";
print OUT "use Cwd\;\n";
print OUT "\$eCluster=\"0.1\"\;\n";
print OUT "\$eCore=\"0.1\"\;\n";
print OUT "\$GENOMES_IDs=\"GENOMES.IDs\"\;\n";
print OUT "\$BLAST_CALL=\"\"\;\n";
print OUT "\$currWorkDir = &Cwd::cwd();\n";
print OUT "\$dir=\$currWorkDir\;\n";
print OUT "\$NAME= pop \@\{\[split m|/|, \$currWorkDir]}\;\n";
print OUT "\$BLAST=\"\$NAME.blast\";\n";
print OUT "\$NUM = `wc -l < \$GENOMES_IDs`;\n";
print OUT "chomp \$NUM;\n";
print OUT "\$NUM=int(\$NUM);\n";
#edit this to zoom in or out inthe SVG canvas
print OUT "\$RESCALE=195000;\n";
#printing user input parameters
#database selection options
if ($database=~/full/) {
$databasesize=`grep "." ./bin/GENOMES.IDs -c`;
chomp $databasesize;
$dbstatement="You are searching in the full database with $databasesize entries";
print OUT "\$LIST=\"\";\n";
}
elsif ($database=~/,/) {
print OUT "\$LIST=\"$database\";\n";
$dbstatement="You are searching in entries $database";
}
else {
die "missing argument -d full OR -db 1,2,3 [IDs OF THE GENOMES INCLUDED IN THE ANALYSIS, full= ENTIRE DATABASE, selected genomes separated by ',']\n\n";
}
#database formating options
if ($formatoption=~/FORMATDB/) {
print OUT "\$FORMAT_DB=\"1\"\;\n";
print "The database will be created with $database entries\n";
}
elsif ($formatoption=~/no/) {
print OUT "\$FORMAT_DB=\"0\"\;\n";
print "\nYou selected the current database, no DB formatting is requiered\n";
print "\nWARNING: MAKE SURE THE CURRENT DATABASE INCLUDES YOUR ENTRIES\n";
print "E.G. USE OF options '-d 1,2,3,4 -x FORMATDB' CREATES A DATABASE WITH ONLY ENTRIES 1,2,3,4\n";
print "TRY -db full and -x FORMATDB TO CREATE A DATABASE WITH ALL THE ENTRIES\n";
}
else {
die "missing argument -x no or -x FORMATDB [FORMAT THE DATABASE SELECTED WITH THE -d OPTION, 'no' or 'FORMATDB]\n\n";
}
if ($query=~/.+query/) {
print OUT "\$QUERIES=\"$query\"\;\n";
system "cp $query ./bin/$query";
}
else {
die "missing argument -q FILE.query [QUERY FILE, a file with .query extension]\n\n";
}
if ($reference) {
$refname=`awk '(\$4==$reference){print \$3}' ./bin/GENOMES.IDs`;
chomp $refname;
print OUT "\$SPECIAL_ORG=\"$reference\";\n";
}
else {
die "missing argument -r 1234 [REFERENCE GENOME ID FROM GENOMES.IDs INDEX, a number]\n\n";
}
if ($evalue) {
print OUT "\$e=\"$evalue\"\;\n";
}
else {
die "missing argument -e 0.0000001 [E-VALUE CUTOFF, a number]\n\n";
}
if ($score) {
print OUT "\$BITSCORE=\"$score\"\;\n";
}
else {
die "missing argument -s 200 [BIT-SCORE CUTOFF a number]\n\n";
}
if ($flanks) {
print OUT "\$ClusterRadio=\"$flanks\"\;\n";
}
else {
die "missing argument -f 10 [NUMBER OF FLAKING GENES INCLUDED IN THE ANALYSIS, a number]\n\n";
}
close OUT;
print "All arguments were provided\n";
print "Running CORASON2 with query $query and reference gene context from entry ID:$reference, $refname\n";
print "The e-value cutoff is $evalue and the bitscore cut-off is $score\n";
print "$dbstatement, the search will be done for $flanks genes flanking the query hits\n\n";
#running corason2.pl
system "cd bin; perl corason2.pl";
#labelling and organizing the outputs
system "mv ./bin/*.contree ./bin/Report.report ./bin/concatenated_matrix.aln ./bin/GENE_CONTEXT.svg ./bin/QUERY_HITS.aln ./bin/*.BLAST .";
$query=~/(.+).(query)/;
$filenames="$1";
system "mkdir $filenames\_results";
system "mv concatenated_matrix.aln ./$filenames\_results/$filenames.core.aln";
system "mv GENE_CONTEXT.svg ./$filenames\_results/$filenames.gene_context.svg";
system "mv QUERY_HITS.aln ./$filenames\_results/$filenames.hits.aln";
system "mv QUERY_HITS.aln.contree ./$filenames\_results/$filenames.hits.contree";
system "mv concatenated_matrix.aln.contree ./$filenames\_results/$filenames.core.contree";
system "mv Report.report ./$filenames\_results/$filenames.report";
system "mv *.BLAST ./$filenames\_results/";
system "rm ./bin/$query";