From bb89aa10982e7dcc78f5b1d28785fef8d6c778a0 Mon Sep 17 00:00:00 2001 From: Peter Orchard Date: Fri, 8 Nov 2019 15:25:30 -0500 Subject: [PATCH 1/3] Add option to append barcodes from another fastq file to read names --- src/cta.cpp | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/src/cta.cpp b/src/cta.cpp index 519bcbd..50b4f6a 100644 --- a/src/cta.cpp +++ b/src/cta.cpp @@ -33,6 +33,13 @@ class FASTQRecord { std::string comment; std::string sequence; std::string quality; + void append_barcode(const std::string& barcode) { + if (name.find(' ')!=std::string::npos) { + name.insert(name.find(' '), "_" + barcode); + } else { + name += "_" + barcode; + } + } }; std::ostream& operator<<(std::ostream& os, const FASTQRecord& record) { @@ -279,6 +286,9 @@ void print_usage() { << "-t|--trim-from-start\n" << " Trim this number of bases from the start of each sequence. The default is zero.\n" + << "-a|--append-barcode\n" + << " Append barcodes from this fastq file to the end of read names (useful for e.g. single-cell data).\n" + << std::endl; } @@ -293,6 +303,7 @@ int main(int argc, char **argv) long long int rc_length = 20; std::string input_filename1; std::string input_filename2; + std::string barcode_filename; std::string output_filename1; std::string output_filename2; @@ -304,11 +315,12 @@ int main(int argc, char **argv) {"fudge", required_argument, NULL, 'f'}, {"trim-from-start", required_argument, NULL, 't'}, {"rc-length", required_argument, NULL, 'r'}, + {"append-barcode", required_argument, NULL, 'a'}, {0, 0, 0, 0} }; // parse the command line arguments - while ((c = getopt_long(argc, argv, "d:f:t:r:vh?", long_options, &option_index)) != -1) { + while ((c = getopt_long(argc, argv, "d:f:t:r:a:vh?", long_options, &option_index)) != -1) { switch (c) { case 'h': case '?': @@ -329,6 +341,9 @@ int main(int argc, char **argv) case 'r': rc_length = std::stoll(optarg); break; + case 'a': + barcode_filename = optarg; + break; case 0: if (long_options[option_index].flag != 0){ break; @@ -369,6 +384,9 @@ int main(int argc, char **argv) bio::file_source input_file2(input_filename2); bio::stream input_stream2(input_file2); + bio::file_source barcode_file(barcode_filename); + bio::stream barcode_stream(barcode_file); + bio::filtering_stream in1; if (is_gzipped_file(input_filename1)) { in1.push(bio::gzip_decompressor()); @@ -381,6 +399,12 @@ int main(int argc, char **argv) } in2.push(input_stream2); + bio::filtering_stream inBarcode; + if (!barcode_filename.empty() && is_gzipped_file(barcode_filename)) { + inBarcode.push(bio::gzip_decompressor()); + } + inBarcode.push(barcode_stream); + if (output_filename1.empty()) { output_filename1 = make_trimmed_filename(input_filename1); } @@ -410,9 +434,16 @@ int main(int argc, char **argv) FASTQRecord record1; FASTQRecord record2; + FASTQRecord recordBarcode; while ((in1 >> record1) && (in2 >> record2)) { trim_pair(record1, record2, rc_length, max_edit_distance, fudge, trim_start, verbose); + if (!barcode_filename.empty()) { + inBarcode >> recordBarcode; + std::string barcode = recordBarcode.sequence; + record1.append_barcode(barcode); + record2.append_barcode(barcode); + } out1 << record1; out2 << record2; } From f61211c0e0b524bd8c5108507492cb369bc96e7a Mon Sep 17 00:00:00 2001 From: Peter Orchard Date: Fri, 8 Nov 2019 15:26:05 -0500 Subject: [PATCH 2/3] Abort Levenshtein distance calculation early when possible --- src/cta.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/cta.cpp b/src/cta.cpp index 50b4f6a..9a53702 100644 --- a/src/cta.cpp +++ b/src/cta.cpp @@ -153,9 +153,17 @@ long long int levenshtein_distance(const std::string& s1, const std::string& s2, v2[j + 1] = std::min(v2[j] + 1, std::min(v1[j + 1] + 1, v1[j] + cost)); } + long long int minimum = maximum + 1; for (long long int j = 0; j < v1_size; j++) { + if (v2[j] < minimum) { + minimum = v2[j]; + } v1[j] = v2[j]; } + if (minimum > maximum) { + // abort early + return maximum + 1; + } } return v2[s2_length]; From a8ffd37c6adf3cd9114a308366f69012e81e59e7 Mon Sep 17 00:00:00 2001 From: arushiv Date: Tue, 11 Feb 2020 18:49:59 -0500 Subject: [PATCH 3/3] option to only append barcodes and trim if the barcode is in a list of selected barcodes --- src/cta.cpp | 89 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 77 insertions(+), 12 deletions(-) diff --git a/src/cta.cpp b/src/cta.cpp index 9a53702..cefd9da 100644 --- a/src/cta.cpp +++ b/src/cta.cpp @@ -1,3 +1,4 @@ + #include #include #include @@ -10,6 +11,7 @@ #include #include #include +#include #include #include @@ -297,9 +299,38 @@ void print_usage() { << "-a|--append-barcode\n" << " Append barcodes from this fastq file to the end of read names (useful for e.g. single-cell data).\n" + << "-x|--selected-barcodes\n" + << " Select and output reads corresponding to these barcodes from this unzipped text file\n" + << std::endl; } +bool getFileContent(std::string fileName, std::map & vecOfStrs) +{ + + // Open the File + std::ifstream in(fileName.c_str()); + + // Check if object is valid + if(!in) + { + std::cerr << "Cannot open the File : "< 0 then save it in vector + if(str.size() > 0) + vecOfStrs[str] = true; + } + //Close The File + in.close(); + return true; +} + int main(int argc, char **argv) { int c, option_index = 0; @@ -312,6 +343,7 @@ int main(int argc, char **argv) std::string input_filename1; std::string input_filename2; std::string barcode_filename; + std::string selected_barcode_filename; std::string output_filename1; std::string output_filename2; @@ -324,11 +356,12 @@ int main(int argc, char **argv) {"trim-from-start", required_argument, NULL, 't'}, {"rc-length", required_argument, NULL, 'r'}, {"append-barcode", required_argument, NULL, 'a'}, + {"selected-barcodes", required_argument, NULL, 'x'}, {0, 0, 0, 0} }; // parse the command line arguments - while ((c = getopt_long(argc, argv, "d:f:t:r:a:vh?", long_options, &option_index)) != -1) { + while ((c = getopt_long(argc, argv, "d:f:t:r:a:x:vh?", long_options, &option_index)) != -1) { switch (c) { case 'h': case '?': @@ -352,6 +385,9 @@ int main(int argc, char **argv) case 'a': barcode_filename = optarg; break; + case 'x': + selected_barcode_filename = optarg; + break; case 0: if (long_options[option_index].flag != 0){ break; @@ -444,17 +480,46 @@ int main(int argc, char **argv) FASTQRecord record2; FASTQRecord recordBarcode; - while ((in1 >> record1) && (in2 >> record2)) { - trim_pair(record1, record2, rc_length, max_edit_distance, fudge, trim_start, verbose); - if (!barcode_filename.empty()) { - inBarcode >> recordBarcode; - std::string barcode = recordBarcode.sequence; - record1.append_barcode(barcode); - record2.append_barcode(barcode); - } - out1 << record1; - out2 << record2; - } + if (!barcode_filename.empty()) { + if (!selected_barcode_filename.empty()) { + // Read list of selected barcodes + std::map selected_barcodes; + + // Get the contents of file in a vector + bool result = getFileContent(selected_barcode_filename, selected_barcodes); + + // check is barcode is in selected barcodes, if yes, trim and append + while ((in1 >> record1) && (in2 >> record2) && (inBarcode >> recordBarcode)) { + std::string barcode = recordBarcode.sequence; + if (selected_barcodes.count(barcode) > 0) { + trim_pair(record1, record2, rc_length, max_edit_distance, fudge, trim_start, verbose); + record1.append_barcode(barcode); + record2.append_barcode(barcode); + out1 << record1; + out2 << record2; + } + } + } + else { + // just trim and append barcode + while ((in1 >> record1) && (in2 >> record2) && (inBarcode >> recordBarcode)) { + std::string barcode = recordBarcode.sequence; + trim_pair(record1, record2, rc_length, max_edit_distance, fudge, trim_start, verbose); + record1.append_barcode(barcode); + record2.append_barcode(barcode); + out1 << record1; + out2 << record2; + } + } + } + else { + // just trim + while ((in1 >> record1) && (in2 >> record2)){ + trim_pair(record1, record2, rc_length, max_edit_distance, fudge, trim_start, verbose); + out1 << record1; + out2 << record2; + } + } bio::close(out1); bio::close(out2); }