Skip to content

Commit

Permalink
Merge branch 'lh3:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
sanchit-misra authored Oct 20, 2022
2 parents 13fb4ed + 2a319c8 commit 10bde16
Showing 1 changed file with 236 additions and 12 deletions.
248 changes: 236 additions & 12 deletions misc/paftools.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env k8

var paftools_version = '2.24-r1132-dirty';
var paftools_version = '2.24-r1141-dirty';

/*****************************
***** Library functions *****
Expand Down Expand Up @@ -2345,12 +2345,15 @@ function paf_pbsim2fq(args)

function paf_junceval(args)
{
var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false;
while ((c = getopt(args, "l:epc")) != null) {
var c, l_fuzzy = 0, print_ovlp = false, print_err_only = false, first_only = false, chr_only = false, aa = false, is_bed = false;
while ((c = getopt(args, "l:epcab1")) != null) {
if (c == 'l') l_fuzzy = parseInt(getopt.arg);
else if (c == 'e') print_err_only = print_ovlp = true;
else if (c == 'p') print_ovlp = true;
else if (c == 'c') chr_only = true;
else if (c == 'a') aa = true;
else if (c == 'b') is_bed = true;
else if (c == '1') first_only = true;
}

if (args.length - getopt.ind < 1) {
Expand All @@ -2360,6 +2363,9 @@ function paf_junceval(args)
print(" -p print overlapping introns");
print(" -e print erroreous overlapping introns");
print(" -c only consider alignments to /^(chr)?([0-9]+|X|Y)$/");
print(" -a miniprot PAF as input");
print(" -b BED as input");
print(" -1 only process the first alignment of each query");
exit(1);
}

Expand Down Expand Up @@ -2413,13 +2419,17 @@ function paf_junceval(args)

file = getopt.ind+1 >= args.length || args[getopt.ind+1] == '-'? new File() : new File(args[getopt.ind+1]);
var last_qname = null;
var re_cigar = /(\d+)([MIDNSHP=X])/g;
var re_cigar = /(\d+)([MIDNSHP=XFGUV])/g;
while (file.readline(buf) >= 0) {
var m, t = buf.toString().split("\t");
var ctg_name = null, cigar = null, pos = null, qname = t[0];
var ctg_name = null, cigar = null, pos = null, qname;

if (t[0].charAt(0) == '@') continue;
if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
if (t[0] == "##PAF") t.shift();
qname = t[0];
if (is_bed) {
ctg_name = t[0], pos = parseInt(t[1]), cigar == null;
} else if (t[4] == '+' || t[4] == '-' || t[4] == '*') { // PAF
ctg_name = t[5], pos = parseInt(t[7]);
var type = 'P';
for (i = 12; i < t.length; ++i) {
Expand Down Expand Up @@ -2449,12 +2459,43 @@ function paf_junceval(args)
}

var intron = [];
while ((m = re_cigar.exec(cigar)) != null) {
var len = parseInt(m[1]), op = m[2];
if (op == 'N') {
intron.push([pos, pos + len]);
pos += len;
} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
if (is_bed) {
intron.push([pos, parseInt(t[2])]);
} else if (aa) {
var tmp_junc = [], tmp = 0;
while ((m = re_cigar.exec(cigar)) != null) {
var len = parseInt(m[1]), op = m[2];
if (op == 'N') {
tmp_junc.push([tmp, tmp + len]);
tmp += len;
} else if (op == 'U') {
tmp_junc.push([tmp + 1, tmp + len - 2]);
tmp += len;
} else if (op == 'V') {
tmp_junc.push([tmp + 2, tmp + len - 1]);
tmp += len;
} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') {
tmp += len * 3;
} else if (op == 'F' || op == 'G') {
tmp += len;
}
}
if (t[4] == '+') {
for (var i = 0; i < tmp_junc.length; ++i)
intron.push([pos + tmp_junc[i][0], pos + tmp_junc[i][1]]);
} else if (t[4] == '-') {
var glen = parseInt(t[8]) - parseInt(t[7]);
for (var i = tmp_junc.length - 1; i >= 0; --i)
intron.push([pos + (glen - tmp_junc[i][1]), pos + (glen - tmp_junc[i][0])]);
}
} else {
while ((m = re_cigar.exec(cigar)) != null) {
var len = parseInt(m[1]), op = m[2];
if (op == 'N') {
intron.push([pos, pos + len]);
pos += len;
} else if (op == 'M' || op == 'X' || op == '=' || op == 'D') pos += len;
}
}
if (intron.length == 0) {
++n_sgl;
Expand Down Expand Up @@ -3110,6 +3151,183 @@ function paf_pafcmp(args)
buf.destroy();
}

function paf_longcs2seq(args) {
var c, opt = { query:false };
while ((c = getopt(args, "q")) != null)
if (c == 'q') opt.query = true;
if (args.length == getopt.ind) {
print("Usage: paftools.js longcs2seq [-q] <long-cs.paf>");
return;
}
var re_cs = /([:=*+-])(\d+|[A-Za-z]+)/g
var buf = new Bytes();
var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);
while (file.readline(buf) >= 0) {
var m, cs = null, t = buf.toString().split("\t");
for (var i = 12; i < t.length; ++i)
if ((m = /^cs:Z:(\S+)/.exec(t[i])) != null) {
cs = m[1];
break;
}
if (cs == null) continue;
var ts = "", qs = "";
while ((m = re_cs.exec(cs)) != null) {
if (m[1] == "=") ts += m[2], qs += m[2];
else if (m[1] == "+") qs += m[2].toUpperCase();
else if (m[1] == "-") ts += m[2].toUpperCase();
else if (m[1] == "*") ts += m[2][0].toUpperCase(), qs += m[2][1].toUpperCase();
else if (m[1] == ":") throw Error("Long cs is required");
}
if (opt.query) {
print(">" + t[0] + "_" + t[2] + "_" + t[3]);
print(qs);
} else {
print(">" + t[5] + "_" + t[7] + "_" + t[8]);
print(ts);
}
}
file.close();
buf.destroy();
}

function paf_paf2gff(args) {
var c, opt = { aa:false };
var re_cigar = /(\d+)([A-Z=])/g;
while ((c = getopt(args, "a")) != null) {
if (c == 'a') opt.aa = true;
}
if (args.length == getopt.ind) {
print("Usage: paftools.js paf2gff [-a] <in.paf>");
return;
}
var buf = new Bytes();
var file = args[getopt.ind] == '-'? new File() : new File(args[getopt.ind]);
var hid = 1, last_name = null;
while (file.readline(buf) >= 0) {
var m, t = buf.toString().split("\t");
if (t[5] == '*') continue; // skip unmapped lines

if (t[0] != last_name) last_name = t[0], hid = 1;
else ++hid;
for (var i = 1; i <= 3; ++i) t[i] = parseInt(t[i]);
for (var i = 6; i <= 11; ++i) t[i] = parseInt(t[i]);
var cigar = null, score = null, np = null, dist_stop = null, dist_start = null;
for (var i = 12; i < t.length; ++i) {
if ((m = /^(cg:Z|AS:i|np:i|da:i|do:i):(\S+)/.exec(t[i])) != null) {
if (m[1] == 'cg:Z') cigar = m[2];
else if (m[1] == 'AS:i') score = parseInt(m[2]);
else if (m[1] == 'np:i') np = parseInt(m[2]);
else if (m[1] == 'do:i') dist_stop = parseInt(m[2]);
else if (m[1] == 'da:i') dist_start = parseInt(m[2]);
}
}
if (cigar == null) throw Error("failed to find the cg:Z tag");
if (score == null) throw Error("failed to find the AS:i tag");

var st = 0, en = 0, phase = 0, pseudo = false, fs = 0, a = [];
if (dist_start != null && dist_start == 0)
a.push([t[5], 'paf2gff', 'start_codon', 0, 3, 0, t[4], '.', 0]);
while ((m = re_cigar.exec(cigar)) != null) {
var len = parseInt(m[1]);
if (m[2] == 'M' || m[2] == 'D') {
en += opt.aa? len * 3 : len;
} else if (m[2] == 'F' || m[2] == 'G' || m[2] == 'R') {
en += len, pseudo = true, fs = 1;
} else if (m[2] == 'N') {
a.push([t[5], 'paf2gff', 'exon', st, en, 0, t[4], phase, fs]);
st = en + len, en += len, phase = 0, fs = 0;
} else if (m[2] == 'U') { // ...xGT...AGxx...
a.push([t[5], 'paf2gff', 'exon', st, en + 1, 0, t[4], phase, fs]);
st = en + len - 2, en += len, phase = 2, fs = 0;
} else if (m[2] == 'V') { // ...xxGT...AGx...
a.push([t[5], 'paf2gff', 'exon', st, en + 2, 0, t[4], phase, fs]);
st = en + len - 1, en += len, phase = 1, fs = 0;
}
}
a.push([t[5], 'paf2gff', 'exon', st, en, 0, t[4], phase, fs]);
if (en != t[8] - t[7]) throw Error("inconsistent cigar");
if (dist_stop != null && dist_stop == 0)
a.push([t[5], 'paf2gff', 'stop_codon', en, en + 3, 0, t[4], '.', 0]);
var type = pseudo? 'pseudogene' : 'protein_coding';
var attr = ['transcript_id=' + t[0] + '#' + hid, 'transcript_type=' + type].join(";");
var trans_attr = 'identity=' + (t[9] / t[10]).toFixed(4);
if (np != null) trans_attr += ';positive=' + (np * 3 / t[10]).toFixed(4);
trans_attr += ';aa_start=' + t[2];
trans_attr += ';aa_end=' + (t[1] - t[3]);
if (dist_start != null && dist_start >= 0) trans_attr += ';dist_start_codon=' + dist_start;
if (dist_stop != null && dist_stop >= 0) trans_attr += ';dist_stop_codon=' + dist_stop;
var trans_st = t[7], trans_en = t[8];
if (dist_stop != null && dist_stop == 0) {
if (t[4] == '-') trans_st -= 3;
else trans_en += 3;
}
print([t[5], 'paf2gff', 'transcript', trans_st + 1, trans_en, score, t[4], '.', attr + ';' + trans_attr].join("\t"));
if (opt.aa && t[4] == '-') {
var b = [], len = t[8] - t[7];
for (var i = a.length - 1; i >= 0; --i) {
var x = len - a[i][3];
a[i][3] = len - a[i][4];
a[i][4] = x;
//a[i][7] = a[i][7] == 0? 0 : 3 - a[i][7]; // not sure if this line is needed
b.push(a[i]);
}
a = b;
}
for (var i = 0; i < a.length; ++i) {
if (!pseudo && a[i][2] == "exon") a[i][2] = "CDS";
a[i][3] += t[7] + 1;
a[i][4] += t[7];
a[i][8] = attr + ";frameshift=" + a[i][8];
print(a[i].join("\t"));
}
}
file.close();
buf.destroy();
}

function paf_gff2junc(args) {
var c, feat = "CDS";
while ((c = getopt(args, "f:")) != null) {
if (c == 'f') feat = getopt.arg;
}
if (getopt.ind == args.length) {
print("Usage: paftools.js gff2junc [-f feature] <in.gff3>");
return;
}
var buf = new Bytes();
var file = args[getopt.ind] == "-"? new File() : new File(args[getopt.ind]);

function process_a(a) {
if (a.length < 2) return;
a = a.sort(function(x, y) { return x[4] - y[4] });
for (var i = 1; i < a.length; ++i)
print([a[i][1], a[i-1][5], a[i][4], a[i][0], 0, a[i][7]].join("\t"));
}

var a = [];
while (file.readline(buf) >= 0) {
var m, t = buf.toString().split("\t");
if (t[0][0] == '#') continue;
if (t[2].toLowerCase() != feat.toLowerCase()) continue;
//print(t.join("\t"));
if ((m = /\bParent=([^;]+)/.exec(t[8])) == null) {
warn("Can't find Parent");
continue;
}
t[3] = parseInt(t[3]) - 1;
t[4] = parseInt(t[4]);
t.unshift(m[1]);
if (a.length > 0 && a[0][0] != m[1]) {
process_a(a);
a.length = 0;
a.push(t);
} else a.push(t);
}
process_a(a);
file.close();
buf.destroy();
}

/*************************
***** main function *****
*************************/
Expand All @@ -3124,6 +3342,9 @@ function main(args)
print(" sam2paf convert SAM to PAF");
print(" delta2paf convert MUMmer's delta to PAF");
print(" gff2bed convert GTF/GFF3 to BED12");
print(" gff2junc convert GFF3 to junction BED");
print(" longcs2seq convert long-cs PAF to sequences");
// print(" paf2gff convert PAF to GFF3 (tested for miniprot only)");
print("");
print(" stat collect basic mapping information in PAF/SAM");
print(" asmstat collect basic assembly information");
Expand Down Expand Up @@ -3151,6 +3372,7 @@ function main(args)
else if (cmd == 'delta2paf') paf_delta2paf(args);
else if (cmd == 'splice2bed') paf_splice2bed(args);
else if (cmd == 'gff2bed') paf_gff2bed(args);
else if (cmd == 'gff2junc') paf_gff2junc(args);
else if (cmd == 'stat') paf_stat(args);
else if (cmd == 'asmstat') paf_asmstat(args);
else if (cmd == 'asmgene') paf_asmgene(args);
Expand All @@ -3168,6 +3390,8 @@ function main(args)
else if (cmd == 'vcfstat') paf_vcfstat(args);
else if (cmd == 'sveval') paf_sveval(args);
else if (cmd == 'vcfsel') paf_vcfsel(args);
else if (cmd == 'longcs2seq') paf_longcs2seq(args);
else if (cmd == 'paf2gff') paf_paf2gff(args);
else if (cmd == 'version') print(paftools_version);
else throw Error("unrecognized command: " + cmd);
}
Expand Down

0 comments on commit 10bde16

Please sign in to comment.