forked from weizhongli/cdhit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
clstr_sql_tbl.pl
executable file
·127 lines (111 loc) · 2.86 KB
/
clstr_sql_tbl.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env perl
if(@ARGV==0){
print "Usage:\n\tclstr_sql_tbl.pl clstr_file tbl_file\n";
exit(1);
}
#usage clstr_sql_tbl.pl clstr_file tbl_file
# 1 if tbl_file does not exist
# create a new tbl_file
# 2 if tbl_file exists
# add 2 new columns to it
#format prot_id prot_len cid_level1 rep_level1 cid_level2 rep_level2 ... cid_leveln rep_leveln
# if I have db90.clstr db60.clstr then db30.clstr
#
# run clstr_sql_tbl.pl db90.clstr db_sql.txt #create first 4 columns
# clstr_sql_tbl.pl db60.clstr db_sql.txt #adding next 2 columns
# clstr_sql_tbl.pl db30.clstr db_sql.txt #adding next 2 columns
my $clstr_file = shift;
my $tbl_file = shift;
my $tbl_file_tmp = shift;
$tbl_file_tmp = "$tbl_file.$$" unless ($tbl_file_tmp);
my ($i, $j, $k, $ll);
if (not (-e $tbl_file)) {
open(TMP, $clstr_file) || die;
open(OUT, "> $tbl_file") || die;
my $cid = -1;
while($ll=<TMP>){
if ($ll =~ /^>/) {
$cid++;
}
else {
chop($ll);
if ($ll =~ /\s(\d+)(aa|nt), >(.+)\.\.\./) {
my $len = $1;
my $id = $3;
my $rep = 0;
if ($ll =~ /\*$/) { $rep = 1; }
print OUT "$id\t$len\t$cid\t$rep\n";
}
else {
die "format error $ll";
}
}
}
close(OUT);
close(TMP);
}
else {
add_info_to_tbl();
}
sub add_info_to_tbl {
my ($i, $j, $k, $ll);
my %id2cid = ();
my %idisrep = ();
open(TMP, $clstr_file) || die;
my $cid = -1;
while($ll=<TMP>){
if ($ll =~ /^>/) {
$cid++;
}
else {
chop($ll);
if ($ll =~ /\s(\d+)(aa|nt), >(.+)\.\.\./) {
my $id = $3;
if ($ll =~ /\*$/) { $idisrep{$id} = 1;}
$id2cid{$id} = $cid;
}
else {
die "format error $ll";
}
}
}
close(TMP);
print STDERR "done reading $clstr_file\n";
my @last_cid_2_id = ();
open(TMP0, $tbl_file) || die;
while($ll=<TMP0>) {
chop($ll);
my @lls = split(/\t/, $ll);
my $id = $lls[0];
my $last_level_cid = $lls[-2];
my $last_level_rep = $lls[-1];
next unless ($last_level_rep == 1);
$last_cid_2_id[$last_level_cid] = $id;
}
close(TMP0);
open(TMP2, $tbl_file) || die;
open(OUT, "> $tbl_file_tmp") || die;
while($ll=<TMP2>) {
chop($ll);
my @lls = split(/\t/, $ll);
my $id = $lls[0];
my $last_level_cid = $lls[-2];
my $last_level_rep = $lls[-1];
my $this_level_cid = "-1";
my $this_level_rep = 0;
if (defined($idisrep{$id})) { $this_level_rep = $idisrep{$id} ;}
if (defined($id2cid{$id})) { $this_level_cid = $id2cid{$id}; }
else {
my $rep1 = $last_cid_2_id[$last_level_cid];
if (defined($id2cid{$rep1})) { $this_level_cid = $id2cid{$rep1}; }
else { die "at $ll";}
}
print OUT "$ll\t$this_level_cid\t$this_level_rep\n";
}
close(OUT);
close(TMP2);
if(-e $tbl_file_tmp){
system("cp $tbl_file_tmp $tbl_file");
system("rm -f $tbl_file_tmp");
}
}