-
Notifications
You must be signed in to change notification settings - Fork 0
/
ucsc2gtf.pl
executable file
·79 lines (73 loc) · 3.61 KB
/
ucsc2gtf.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/perl -w -s
use tools;
### converts UCSC table file (mysql table with headers
### '#bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds')
### to a GTF file (-out argument), and also a mapping (-m argument) from gene -> transcripts/isoforms (isoform ids
### are in format: gene.'__'. chromosome \t refseqid '__' chromosome '__' number
### The addition of chromosome name is needed for alternate locus groups (i.e. alternate partial assemblies; typically
### for part of the other half of a diploid genome), as they often have the same genes mapping there.
### Looks like the --utr=3,5 only outputs the 3 or 5' UTRs, in weird format:
### ---->>>>------>>>>------==== (--- : untransl exon, >>>: intron, ===: translated part of exon)
### |<- - - - - - - - - - ->| is transcript coordinates (should have been called 5UTR or 3UTR)
### |<- ->| is the untranslated part of the first exon (unclear what to call this)
### original written by Dominic Grün
### See also genePredToGtf and
### http://genomewiki.ucsc.edu/index.php/Genes_in_gtf_or_gff_format
warn "### NOTE: the GRCh38 RefSeq table has duplicate gene names for e.g. RNU1-1; basically this table is corrupt,
### leading to some huge transcripts (with exons from both strands !?!?) ... UCSC has been notified
" ;
if (@ARGV == 0 && (!$in || !$out || !$m)) {
die "usage: -in=INPUT.ucsc_format -out=OUTPUT.gtf -m=gene2isoforms.tsv -utr=0,3,5\n";
}
$utr = 0 if !$utr;
open(OUT,">",$out);
open(OUT2,">",$m);
open(IN,"<",$in) or die "$in: $!";
while(<IN>){
chomp;
next if /^\#/;
($dum,$name,$chr,$str,$start,$end,$cdsStart,$cdsEnd,$dum,$exstart,$exend,$dum,$name2)=split(/\t/);
if($start <= 0 ) {
print STDERR "transcript start <= 0, adjusted to 1. Line $.: $_\n";
$start=1;
$exstart =~ s/0,/1,/;
}
@S = split(/\,/,$exstart); # exon starts
@E = split(/\,/,$exend);
$sid = $name2."__".$chr; # gene_id
$t = $name."__".$chr; # refseqid, i.e. transcript name
$count{$t}++;
$tid = $name."__".$chr."__".$count{$t}; # transcript_id
push(@{$n{$sid}},$tid); # mappings of gene_id
if ( $utr == 0 ){
print OUT join("\t",($chr,"UCSC","transcript",$start,$end,0,$str,".", "gene_id \"".$name2."\"; transcript_id \"".$tid."\";"))."\n";
for $i (0..$#S){
print OUT join("\t",($chr,"UCSC","exon",$S[$i],$E[$i],0,$str,".", "gene_id \"".$name2."\"; transcript_id \"".$tid."\"; exon \"".($i + 1)."\";"))."\n";
}
}else{
if ( ( ( $utr == 5 && $str eq "+" ) || ( $utr == 3 && $str eq "-" ) ) && $start < $cdsStart ){
print OUT join("\t",($chr,"UCSC","transcript",$start,$cdsStart - 1,0,$str,".", "gene_id \"".$name2."\"; transcript_id \"".$tid."\";"))."\n";
for $i (0..$#S){
if ( $S[$i] < $cdsStart ){
if ( $E[$i] < $cdsStart ){ $e = $E[$i] }else{ $e = $cdsStart - 1};
print OUT join("\t",($chr,"UCSC","exon",$S[$i],$e,0,$str,".", "gene_id \"".$name2."\"; transcript_id \"".$tid."\"; exon \"".($i + 1)."\";"))."\n";
}
}
}
if ( ( ( $utr == 5 && $str eq "-" ) || ( $utr == 3 && $str eq "+" ) ) && $end > $cdsEnd ){
print OUT join("\t",($chr,"UCSC","transcript",$cdsEnd + 1,$end,0,$str,".", "gene_id \"".$name2."\"; transcript_id \"".$tid."\";"))."\n";
for $i (0..$#S){
if ( $E[$i] > $cdsEnd ){
if ( $S[$i] > $cdsEnd ){ $s = $S[$i] }else{ $s = $cdsEnd + 1};
print OUT join("\t",($chr,"UCSC","exon",$s,$E[$i],0,$str,".", "gene_id \"".$name2."\"; transcript_id \"".$tid."\"; exon \"".($i + 1)."\";"))."\n";
}
}
}
}
}
close(IN);
close(OUT);
foreach ( sort keys %n ){
print OUT2 $_."\t".join("|",@{$n{$_}})."\n";
}
close(OUT2);