forked from Ensembl/VEP_plugins
-
Notifications
You must be signed in to change notification settings - Fork 0
/
MTR.pm
153 lines (111 loc) · 3.95 KB
/
MTR.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
=head1 CONTACT
Slave Petrovski <[email protected]>
Michael Silk <[email protected]>
=cut
=head1 NAME
MTR (Missense Tolerance Ratio)
=head1 SYNOPSIS
mv MTR.pm ~/.vep/Plugins
curl -O ftp://mtr-viewer.mdhs.unimelb.edu.au/pub/mtrflatfile_1.0.txt.gz
curl -O ftp://mtr-viewer.mdhs.unimelb.edu.au/pub/mtrflatfile_1.0.txt.gz.tbi
perl variant_effect_predictor.pl -i variations.vcf --plugin MTR,mtrflatfile_1.0.txt.gz
=head1 DESCRIPTION
A VEP plugin that retrieves Missense Tolerance Ratio (MTR) scores for
variants from a tabix-indexed flat file.
MTR scores quantify the amount of purifying selection acting
specifically on missense variants in a given window of protein-coding
sequence. It is estimated across a sliding window of 31 codons and uses
observed standing variation data from the WES component of the Exome
Aggregation Consortium Database (ExAC), version 2.0
(http://gnomad.broadinstitute.org).
Please cite the MTR publication alongside the VEP if you use this resource:
http://genome.cshlp.org/content/27/10/1715
The Bio::DB::HTS perl library or tabix utility must be installed in your path
to use this plugin. MTR flat files can be downloaded from:
ftp://mtr-viewer.mdhs.unimelb.edu.au/pub
NB: Data are available for GRCh37 only
=cut
package MTR;
use strict;
use warnings;
use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin;
use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin);
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
# test tabix
die "ERROR: tabix does not seem to be in your path\n" unless `which tabix 2>&1` =~ /tabix$/;
# get MTR file
my $file = $self->params->[0];
$self->add_file($file);
# remote files?
if($file =~ /tp\:\/\//) {
my $remote_test = `tabix -f $file 1:1-1 2>&1`;
if($remote_test && $remote_test !~ /get_local_version/) {
die "$remote_test\nERROR: Could not find file or index file for remote annotation file $file\n";
}
}
# check files exist
else {
die "ERROR: MTR file $file not found\n" unless -e $file;
die "ERROR: Tabix index file $file\.tbi not found - perhaps you need to create it first?\n" unless -e $file.'.tbi';
}
# get headers and store on self
open HEAD, "tabix -fh $file 1:1-1 2>&1 | ";
while(<HEAD>) {
next unless /^\#/;
chomp;
$self->{headers} = [split];
}
close HEAD;
return $self;
}
sub feature_types {
return ['Transcript'];
}
sub variation_feature_types {
return ['VariationFeature'];
}
sub get_header_info {
my $self = shift;
return {
MTR => 'MTR score',
FDR => 'MTR false discovery rate adjusted binomial exact test.',
MTR_centile => 'MTR gene-specific percentile'
}
}
sub run {
my ($self, $tva) = @_;
my $vf = $tva->variation_feature;
# get allele, reverse comp if needed
my $allele = $tva->variation_feature_seq;
reverse_comp(\$allele) if $vf->{strand} < 0;
return {} unless $allele =~ /^[ACGT]$/;
my $tr_id = $tva->transcript->stable_id;
# data is written by pos, allele, transcript ID (feature)
# grep lines read in matched on position so that they also are matched on allele and transcript ID
my ($res) = grep {
$_->{Genomic_position} eq $vf->{start} &&
$_->{Genomic_position} eq $vf->{end} &&
$_->{alt} eq $allele &&
$_->{Feature} eq $tr_id
} @{$self->get_data($vf->{chr}, $vf->{start}, $vf->{end})};
# return only the keys defined by get_header_info()
return $res ? { map {$_ => $res->{$_}} grep {defined($res->{$_}) && $res->{$_} ne '.'} keys %{$self->get_header_info} } : {};
}
sub parse_data {
my ($self, $line) = @_;
$line =~ s/\r$//g;
my @split = split /\t/, $line;
# parse data into hash of col names and values
my %data = map {$self->{headers}->[$_] => $split[$_]} (0..(scalar @{$self->{headers}} - 1));
return \%data;
}
sub get_start {
return $_[1]->{'Genomic_position'};
}
sub get_end {
return $_[1]->{'Genomic_position'};
}
1;