forked from kharchenkolab/dropEst
-
Notifications
You must be signed in to change notification settings - Fork 0
/
droptag.cpp
328 lines (276 loc) · 10.2 KB
/
droptag.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
#include <iostream>
#include <iomanip>
#include <vector>
#include <algorithm>
#include <getopt.h>
#include <boost/property_tree/ptree.hpp>
#include <boost/property_tree/xml_parser.hpp>
#include <boost/filesystem/path.hpp>
#include <TagsSearch/IndropV3LibsTagsFinder.h>
#include <Tools/UtilFunctions.h>
#include "TagsSearch/FixPosSpacerTagsFinder.h"
#include "TagsSearch/IndropV1TagsFinder.h"
#include "TagsSearch/TagsFinderBase.h"
#include "TagsSearch/IndropV3TagsFinder.h"
#include <TagsSearch/ConcurrentGzWriter.h>
#include <TagsSearch/IClipTagsFinder.h>
#include <TagsSearch/SplitSeqTagsFinder.h>
#include <Rcpp.h>
#include "Tools/Logs.h"
using namespace std;
using namespace TagsSearch;
using namespace boost::property_tree;
const std::string CONFIG_PATH = "config.TagsSearch";
const std::string PROCESSING_CONFIG_PATH = CONFIG_PATH + ".Processing";
const std::string MULTIPLE_BARCODES_CONFIG_PATH = CONFIG_PATH + ".MultipleBarcodeSearch";
const std::string BARCODES_CONFIG_PATH = CONFIG_PATH + ".BarcodesSearch";
const std::string SPACER_CONFIG_PATH = CONFIG_PATH + ".SpacerSearch";
static const std::string SCRIPT_NAME = "droptag";
struct Params
{
bool cant_parse = false;
bool save_reads_params = false;
bool quiet = false;
bool save_stats = false;
int num_of_threads = 1;
int reads_per_out_file = -1;
string base_name = "";
string config_file_name = "";
string log_prefix = "";
string lib_tag = "";
vector<string> read_files = vector<string>();
};
void save_stats(const string &out_filename, const shared_ptr<TagsFinderBase> &tags_finder);
static void usage()
{
cerr << SCRIPT_NAME << " -- generate tagged fastq files for alignment\n";
cerr << "Version: " << VERSION << "\n\n";
cerr << "SYNOPSIS\n";
cerr << "\t" << SCRIPT_NAME << " [options] "
<< "-c config.xml barcode_reads.fastq [barcode_umi_reads.fastq] gene_reads.fastq [library_tags.fastq]\n";
cerr << "OPTIONS:\n";
cerr << "\t-c, --config filename: xml file with " << SCRIPT_NAME << " parameters\n";
cerr << "\t-h, --help: show this info\n";
cerr << "\t-l, --log-prefix prefix: logs prefix\n";
cerr << "\t-n, --name name: alternative output base name\n";
cerr << "\t-p, --parallel number: number of threads\n";
cerr << "\t-s, --save-reads-params : serialize reads parameters to save quality info\n";
cerr << "\t-S, --save-stats : save stats to rds file\n";
cerr << "\t-r, --reads-per-out-file : maximum number of reads per output file; (0 - unlimited). Overrides corresponding xml parameter.\n";
cerr << "\t-t, --lib-tag library tag : (for IndropV3 with library tag only)\n";
cerr << "\t-q, --quiet : disable logs\n";
}
static void check_files_existence(const Params ¶ms)
{
if (!params.config_file_name.empty() && !std::ifstream(params.config_file_name))
throw std::runtime_error("Can't open config file '" + params.config_file_name + "'");
for (auto const &file : params.read_files)
{
if (!std::ifstream(file))
throw std::runtime_error("Can't open file with reads: '" + file + "'");
}
}
/// Returns instance of TagsFinderBase based on the config file and number of input files, initializing it according to CLI parameters
/// @param params CLI parameters
/// @param pt specific node in the config file (config/TagsSearch)
shared_ptr<TagsFinderBase> get_tags_finder(const Params ¶ms, const ptree &pt)
{
auto const &config = pt.get_child(CONFIG_PATH);
std::string protocol_type = config.get<std::string>("protocol", "");
if (protocol_type.empty())
throw std::runtime_error("Protocol is empty. Please, specify it in the config (TagsSearch/protocol)");
auto const &processing_config = pt.get_child(PROCESSING_CONFIG_PATH, ptree());
size_t max_records_per_file = params.reads_per_out_file;
if(max_records_per_file==-1) max_records_per_file = processing_config.get<size_t>("reads_per_out_file", 0);
auto writer = std::make_shared<ConcurrentGzWriter>(params.base_name, "fastq.gz", max_records_per_file);
const std::string input_files_num_error_text = "Unexpected number of read files: " +
std::to_string(params.read_files.size()) +
" for protocol \"" + protocol_type + "\"";
if (protocol_type == "indrop3")
{
if (params.read_files.size() == 4)
{
if (params.lib_tag.empty())
throw std::runtime_error("For IndropV3 with library tag, tag (-t option) should be specified");
return shared_ptr<TagsFinderBase>(
new IndropV3LibsTagsFinder(params.read_files, params.lib_tag, pt.get_child(BARCODES_CONFIG_PATH),
processing_config, writer, params.save_stats, params.save_reads_params));
}
if (params.read_files.size() != 3)
throw std::runtime_error(input_files_num_error_text);
return shared_ptr<TagsFinderBase>(
new IndropV3TagsFinder(params.read_files, pt.get_child(BARCODES_CONFIG_PATH), processing_config,
writer, params.save_stats, params.save_reads_params));
}
if (protocol_type == "10x") // 10x has the same format of files as indrop v3
{
if (params.read_files.size() != 3)
throw std::runtime_error(input_files_num_error_text);
return shared_ptr<TagsFinderBase>(
new IndropV3TagsFinder(params.read_files, pt.get_child(BARCODES_CONFIG_PATH), processing_config,
writer, params.save_stats, params.save_reads_params));
}
if (protocol_type == "indrop")
{
if (params.read_files.size() != 2)
throw std::runtime_error(input_files_num_error_text);
if (!pt.get<std::string>(SPACER_CONFIG_PATH + ".barcode_mask", "").empty())
return shared_ptr<TagsFinderBase>(
new FixPosSpacerTagsFinder(params.read_files, pt.get_child(SPACER_CONFIG_PATH), processing_config,
writer, params.save_stats, params.save_reads_params));
return shared_ptr<TagsFinderBase>(
new IndropV1TagsFinder(params.read_files, pt.get_child(SPACER_CONFIG_PATH), processing_config,
writer, params.save_stats, params.save_reads_params));
}
if (protocol_type == "iclip")
{
if (params.read_files.size() != 1)
throw std::runtime_error(input_files_num_error_text);
return shared_ptr<TagsFinderBase>(
new IClipTagsFinder(params.read_files, pt.get_child(BARCODES_CONFIG_PATH), processing_config,
writer, params.save_stats, params.save_reads_params));
}
if ((protocol_type == "split_seq") || (protocol_type == "drop_seq") || (protocol_type == "cel_seq2") ||
(protocol_type == "seq_well"))
{
if (params.read_files.size() != 2)
throw std::runtime_error(input_files_num_error_text);
return shared_ptr<TagsFinderBase>(
new SplitSeqTagsFinder(params.read_files, pt.get_child(MULTIPLE_BARCODES_CONFIG_PATH), processing_config,
writer, params.save_stats, params.save_reads_params));
}
throw std::runtime_error("Unknown protocol: '" + protocol_type + "'. Please, check it in the config (TagsSearch/protocol)");
}
Params parse_cmd_params(int argc, char **argv)
{
int option_index = 0;
int c;
static struct option long_options[] = {
{"config", required_argument, 0, 'c'},
{"help", no_argument, 0, 'h'},
{"log-prefix", required_argument, 0, 'l'},
{"name", required_argument, 0, 'n'},
{"parallel", required_argument, 0, 'p'},
{"reads-per-out-file", required_argument, 0, 'r'},
{"save-reads-params", no_argument, 0, 's'},
{"save-stats", required_argument, 0, 'S'},
{"lib-tag", required_argument, 0, 't'},
{"quiet", no_argument, 0, 'q'},
{0, 0, 0, 0}
};
Params params;
while ((c = getopt_long(argc, argv, "c:hl:n:p:r:sSt:q", long_options, &option_index)) != -1)
{
switch (c)
{
case 'c' :
params.config_file_name = string(optarg);
break;
case 'l' :
params.log_prefix = string(optarg);
break;
case 'h' :
usage();
exit(0);
case 'n' :
params.base_name = string(optarg);
break;
case 'p' :
params.num_of_threads = int(strtol(optarg, nullptr, 10));
break;
case 's' :
params.save_reads_params = true;
break;
case 'r' :
params.reads_per_out_file = int(strtol(optarg, nullptr, 10));
break;
case 'S' :
params.save_stats = true;
break;
case 't' :
params.lib_tag= string(optarg);
break;
case 'q' :
params.quiet = true;
break;
default:
cerr << SCRIPT_NAME << ": unknown arguments passed" << endl;
usage();
params.cant_parse = true;
return params;
}
}
if (params.config_file_name.empty())
{
cerr << SCRIPT_NAME << ": config file must be supplied" << endl;
params.cant_parse = true;
}
if (optind == argc)
{
cerr << SCRIPT_NAME << ": read files must be supplied" << endl;
usage();
params.cant_parse = true;
return params;
}
while (optind != argc)
{
params.read_files.emplace_back(argv[optind++]);
}
if (params.base_name.empty())
{
params.base_name = boost::filesystem::path(params.read_files.back()).filename().generic_string() + ".tagged";
}
return params;
}
void save_stats(const string &out_filename, const shared_ptr<TagsFinderBase> &tags_finder)
{
using namespace Rcpp;
Tools::trace_time("Writing R data to " + out_filename);
auto R = Tools::init_r();
(*R)["d"] = List::create(
_["reads_per_cb"] = wrap(tags_finder->num_reads_per_cb())
);
R->parseEvalQ("saveRDS(d, '" + out_filename + "')");
}
int main(int argc, char **argv)
{
std::string command_line;
for (int i = 0; i < argc; ++i)
{
command_line += argv[i];
command_line += " ";
}
Params params = parse_cmd_params(argc, argv);
if (params.cant_parse)
return 1;
check_files_existence(params);
if (params.log_prefix.length() != 0)
{
params.log_prefix += "_";
}
Tools::init_log(!params.quiet, false, params.log_prefix + "tag_main.log", params.log_prefix + "tag_debug.log");
Tools::copy_file(params.config_file_name, params.log_prefix + "tag_config.dump.xml");
L_TRACE << command_line;
L_TRACE << "Version: " << VERSION << ".";
ptree pt;
xml_parser::read_xml(params.config_file_name, pt);
try
{
shared_ptr<TagsFinderBase> finder = get_tags_finder(params, pt);
Tools::trace_time("Run");
// All work is done here
finder->run(params.num_of_threads);
Tools::trace_time("All done");
if (params.save_stats)
{
save_stats(params.base_name + ".rds", finder);
}
}
catch (std::runtime_error &err)
{
time_t ctt = time(nullptr);
L_ERR << err.what() << "\nTime: " << asctime(localtime(&ctt));
return 1;
}
return 0;
}