-
Notifications
You must be signed in to change notification settings - Fork 7
/
CTaxonNamesDictionary.h
178 lines (147 loc) · 5.85 KB
/
CTaxonNamesDictionary.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
/* BaitFisher (version 1.2.8) a program for designing DNA target enrichment baits
* Copyright 2013-2016 by Christoph Mayer
*
* This source file is part of the BaitFisher-package.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with BaitFisher. If not, see <http://www.gnu.org/licenses/>.
*
*
* For any enquiries send an Email to Christoph Mayer
*
* When publishing work that is based on the results please cite:
* Mayer et al. 2016: BaitFisher: A software package for multi-species target DNA enrichment probe design
*
*/
#ifndef CTAXONNAMESDICTIONARY_H
#define CTAXONNAMESDICTIONARY_H
#include "faststring2.h"
#include "CDistance_matrix.h"
#include "CSequences2.h"
#include "CFile/CFile2_1.h"
#include "CSeqNameList.h"
#include <set>
#include <list>
#include <map>
#include <vector>
class CTaxonNamesDictionary
{
CSeqNameList *snl;
CSeqNameList all;
// Each index stored in the taxon_names_to_index map is equal to the index of the same taxon name in the taxon_names_vec.
// This allows us to translate indices and names back and forth.
std::vector<faststring> taxon_names_vec;
std::map<faststring, unsigned> taxon_names_to_index;
public:
CTaxonNamesDictionary(faststring directoryname, std::list<faststring> &list_of_fasta_file_names, unsigned short sequence_name_taxon_field_number, char field_delimiter):all(true)
{
// Collect all taxon names to build a taxon name dictionary
// The collection of all sequence names is stored in the all object.
faststring full_transcript_file_name;
std::list<faststring>::iterator file_names_it, file_names_it_end, file_names_it_begin;
file_names_it_begin = file_names_it = list_of_fasta_file_names.begin();
file_names_it_end = list_of_fasta_file_names.end();
// For each fasta file
while (file_names_it != file_names_it_end)
{
full_transcript_file_name = directoryname + "/" + *file_names_it;
// std::cout << "Reading sequence names in " << full_transcript_file_name << std::endl;
snl = new CSeqNameList(full_transcript_file_name.c_str(), true);
if (file_names_it == file_names_it_begin)
{
all.set_to(*snl, "all-names-union");
}
else
{
all.add_List_non_redundant(*snl);
}
++file_names_it;
}
// all.print(cout,0);
// TAXON NAMES DICTIONARY: consists of a vector, and map
// A set is used to store all names temporarily.
std::set<faststring> taxon_names_set;
bool error_get_set_of_name_field;
// Take the names of the sequences, divide it according to the delimiter '|' and
// extract field number sequence_name_taxon_field_number. Store all names in the set.
error_get_set_of_name_field = all.get_set_of_name_field(sequence_name_taxon_field_number+1, field_delimiter, taxon_names_set);
if (error_get_set_of_name_field)
{
std::cerr << "WARNING: At least one error occurred when trying to read taxon names from the fasta sequence headers."
<< " Usually this error occurs, when headers are not in the proper format, when the parameter file contains a wrong"
<< " wrong sequence name field delimiter or if the taxon name field number is wrong." << std::endl;
std::cerr << "This error will influence: TODO." << std::endl;
}
// Now we have a set that contains all taxon names found in all fasta files.
std::set<faststring>::iterator it, it_end;
it = taxon_names_set.begin();
it_end = taxon_names_set.end();
unsigned i;
i=0;
while (it != it_end)
{
// Since the taxon_names_vec was empty originally, the names have an index equal to i.
taxon_names_vec.push_back(*it); // push all names into the vector. unique-index -> name
taxon_names_to_index[*it] = i; // for each name store its index in the map. name -> unique-index
++it;
++i;
}
// Now we can translate unique indices and taxon names back and forth.
}
void print(std::ostream &outfile)
{
outfile << "## Index <-> taxon name:" << std::endl;
unsigned i, N = taxon_names_vec.size();
for (i=0; i<N; ++i)
{
outfile << i << " " << taxon_names_vec[i] << std::endl;
}
}
// Main dictionary retrieval function.
unsigned dictionary_get_index_of_taxonname(faststring taxonname)
{
/* map<faststring, unsigned>::iterator it, it_end; */
/* it = taxon_names_to_index.begin(); */
/* it_end = taxon_names_to_index.end(); */
/* cout << "taxon_names_to_index map" << endl; */
/* while (it != it_end) */
/* { */
/* cout << it->first << "=>" << it->second << endl; */
/* ++it; */
/* } */
std::map<faststring, unsigned>::iterator it_find = taxon_names_to_index.find(taxonname);
if (it_find != taxon_names_to_index.end() )
return it_find->second;
else
return -1u;
}
const std::vector<faststring> & get_taxon_names_vec()
{
return taxon_names_vec;
}
void print_taxon_names_to_index(std::ostream &os)
{
std::map<faststring, unsigned>::iterator it, it_end;
it = taxon_names_to_index.begin();
it_end = taxon_names_to_index.end();
os << "Content of taxon names to global index dictionary: " << std::endl;
while (it != it_end)
{
os.width(30);
os << it->first << " " << it->second << std::endl;
++it;
}
}
};
#endif