diff --git a/LOEUF.pm b/LOEUF.pm index 504d24e7..03964065 100644 --- a/LOEUF.pm +++ b/LOEUF.pm @@ -48,25 +48,24 @@ limitations under the License. LOEUF scores can be downloaded from - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7334197/bin/41586_2020_2308_MOESM4_ESM.zip + GRCh37: https://gnomad.broadinstitute.org/downloads#v2-constraint (pLoF Metrics by Gene TSV) + GRCh38: https://gnomad.broadinstitute.org/downloads#v4-constraint (Constraint metrics TSV) For GRCh37: These files can be tabix-processed by: - Unzip 41586_2020_2308_MOESM4_ESM.zip - cd supplement - zcat supplementary_dataset_11_full_constraint_metrics.tsv.gz | (head -n 1 && tail -n +2 | sort -t$'\t' -k 76,76 -k 77,77n ) > loeuf_temp.tsv + zcat gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz | (head -n 1 && tail -n +2 | sort -t$'\t' -k 76,76 -k 77,77n ) > loeuf_temp.tsv sed '1s/.*/#&/' loeuf_temp.tsv > loeuf_dataset.tsv bgzip loeuf_dataset.tsv tabix -f -s 76 -b 77 -e 78 loeuf_dataset.tsv.gz For GRCh38: - The LOEUF scores are available for assembly GRCh37, to be able to run the plugin for GRCh38 - please remap the regions from file supplementary_dataset_11_full_constraint_metrics.tsv - After the remapping the file can be tabix-processed by: - cat supplementary_dataset_11_full_constraint_metrics_grch38.tsv | (head -n 1 && tail -n +2 | sort -t$'\t' -k 76,76 -k 77,77n ) > loeuf_grch38_temp.tsv + The GRCh38 file does not have gene co-ordinates information. First you need to add the gene co-ordiates information. + You can use the Ensembl Perl API to create a script and perform that - https://www.ensembl.org/info/docs/api/core/index.html. + After adding the start and end position of the genes at the last 2 columns you can process the file as follows: + cat gnomad.v4.1.constraint_metrics_with_coordinates.tsv | (head -n 1 && tail -n +2 | sort -t$'\t' -k 53,53 -k 56,56n ) > loeuf_grch38_temp.tsv sed '1s/.*/#&/' loeuf_grch38_temp.tsv > loeuf_dataset_grch38.tsv bgzip loeuf_dataset_grch38.tsv - tabix -f -s 76 -b 77 -e 78 loeuf_dataset_grch38.tsv.gz + tabix -f -s 53 -b 56 -e 57 loeuf_dataset_grch38.tsv.gz The tabix utility must be installed in your path to use this plugin. @@ -84,6 +83,18 @@ use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin); use Scalar::Util qw(looks_like_number); +# gnomAD v2 and v4 data have different headers +my $valid_headers = { + "v2" => { + "index" => [1, 30, 64], + "header" => ["transcript", "oe_lof_upper", "gene_id"] + }, + "v4" => { + "index" => [2, 22, 1], + "header" => ["transcript", "lof.oe_ci.upper", "gene_id"] + } +}; + sub new { my $class = shift; @@ -113,15 +124,25 @@ sub new { # Compare indexes of expected and observed columns die "ERROR: Could not read headers from $param_hash->{file}\n" unless defined($headers) && scalar @{$headers}; - my @obs_columns = @{$headers}[1,30,64] ; - my @exp_columns = ("transcript", "oe_lof_upper", "gene_id"); + my @missing_columns; - foreach my $index (0 .. $#exp_columns){ - if ($obs_columns[$index] ne $exp_columns[$index]){ - push @missing_columns,$exp_columns[$index]; + foreach my $version (keys %{ $valid_headers }){ + my $i = 0; + @missing_columns = (); + foreach my $index ( @{ $valid_headers->{$version}->{"index"} }){ + my $exp_column = $valid_headers->{$version}->{"header"}->[$i]; + my $obs_column = $headers->[$index] ? $headers->[$index] : ""; + + push @missing_columns, $exp_column if $exp_column ne $obs_column; + $i++; + } + + unless (@missing_columns) { + $self->{data_version} = $version; + last; } } - my $missing_columns_str = join(", ", @missing_columns) if scalar @missing_columns; + my $missing_columns_str = join(", ", @missing_columns) if scalar @missing_columns; die "ERROR: Missing columns: $missing_columns_str\n" if defined($missing_columns_str); # Check match_by argument and store on self @@ -172,11 +193,12 @@ sub run { my $first_entry_flag = 0; foreach (@data){ if (looks_like_number($_->{result}->{LOEUF})){ - if (!$first_entry_flag){ - $min_loeuf_score = $_->{result}->{LOEUF}; - $first_entry_flag = 1 ; - next; - } + if (!$first_entry_flag){ + $min_loeuf_score = $_->{result}->{LOEUF}; + $first_entry_flag = 1 ; + next; + } + if ($_->{result}->{LOEUF} < $min_loeuf_score){ $min_loeuf_score = $_->{result}->{LOEUF}; } @@ -194,7 +216,9 @@ sub run { sub parse_data { my ($self, $line) = @_; my @values = split /\t/, $line; - my ($transcript_id, $oe_lof_upper, $gene_id) = @values[1,30,64]; + + my ($transcript_id, $oe_lof_upper, $gene_id) = $self->{data_version} ? + @values[ @{ $valid_headers->{$self->{data_version}}->{"index"} } ] : @values[1,30,64]; return { gene_id => $gene_id, transcript_id => $transcript_id, diff --git a/pLI.pm b/pLI.pm index c5583a32..5f9015a0 100644 --- a/pLI.pm +++ b/pLI.pm @@ -68,6 +68,9 @@ pLI - Add pLI score to the VEP output ./vep -i variants.vcf --plugin pLI,values_file.txt ./vep -i variants.vcf --plugin pLI,values_file.txt,transcript # to check for the transcript score. + gnomAD v4 release expanded the scale of pLI score calculation. The file can be downloaded from - + https://gnomad.broadinstitute.org/downloads#v4-constraint (Constraint metrics TSV) + To use the data you can follow the same procedure as above but needs to change the column number to accordingly. =cut