From 0f38ad681039f307ccf75103de7a9ed21fc12137 Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Thu, 7 Nov 2024 15:55:32 +0000 Subject: [PATCH 1/3] Allow gnomAD v4 data for LOEUF plugin --- LOEUF.pm | 66 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/LOEUF.pm b/LOEUF.pm index 504d24e7..3217dcb5 100644 --- a/LOEUF.pm +++ b/LOEUF.pm @@ -48,25 +48,24 @@ limitations under the License. LOEUF scores can be downloaded from - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7334197/bin/41586_2020_2308_MOESM4_ESM.zip + GRCh37: https://gnomad.broadinstitute.org/downloads#v2-constraint (pLoF Metrics by Gene TSV) + GRCh38: https://gnomad.broadinstitute.org/downloads#v4-constraint (Constraint metrics TSV) For GRCh37: These files can be tabix-processed by: - Unzip 41586_2020_2308_MOESM4_ESM.zip - cd supplement - zcat supplementary_dataset_11_full_constraint_metrics.tsv.gz | (head -n 1 && tail -n +2 | sort -t$'\t' -k 76,76 -k 77,77n ) > loeuf_temp.tsv + zcat gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz | (head -n 1 && tail -n +2 | sort -t$'\t' -k 76,76 -k 77,77n ) > loeuf_temp.tsv sed '1s/.*/#&/' loeuf_temp.tsv > loeuf_dataset.tsv bgzip loeuf_dataset.tsv tabix -f -s 76 -b 77 -e 78 loeuf_dataset.tsv.gz For GRCh38: - The LOEUF scores are available for assembly GRCh37, to be able to run the plugin for GRCh38 - please remap the regions from file supplementary_dataset_11_full_constraint_metrics.tsv - After the remapping the file can be tabix-processed by: - cat supplementary_dataset_11_full_constraint_metrics_grch38.tsv | (head -n 1 && tail -n +2 | sort -t$'\t' -k 76,76 -k 77,77n ) > loeuf_grch38_temp.tsv + The GRCh38 file does not have gene co-ordinates information. First you need to add the gene co-ordiates information. + You can use the Ensembl Perl API to create a script and perform that - https://www.ensembl.org/info/docs/api/core/index.html. + After adding the start and end position of the genes at the last 2 columns you can process the file as follows: + cat gnomad.v4.1.constraint_metrics_with_coordinates.tsv | (head -n 1 && tail -n +2 | sort -t$'\t' -k 53,53 -k 56,56n ) > loeuf_grch38_temp.tsv sed '1s/.*/#&/' loeuf_grch38_temp.tsv > loeuf_dataset_grch38.tsv bgzip loeuf_dataset_grch38.tsv - tabix -f -s 76 -b 77 -e 78 loeuf_dataset_grch38.tsv.gz + tabix -f -s 53 -b 56 -e 57 loeuf_dataset_grch38.tsv.gz The tabix utility must be installed in your path to use this plugin. @@ -84,6 +83,20 @@ use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin); use Scalar::Util qw(looks_like_number); +# gnomAD v2 and v4 data have different headers +my $valid_headers = { + "v2" => { + 1 => "transcript", + 30 => "oe_lof_upper", + 64 => "gene_id" + }, + "v4" => { + 2 => "transcript", + 22 => "lof.oe_ci.upper", + 1 => "gene_id" + }, +}; + sub new { my $class = shift; @@ -113,15 +126,23 @@ sub new { # Compare indexes of expected and observed columns die "ERROR: Could not read headers from $param_hash->{file}\n" unless defined($headers) && scalar @{$headers}; - my @obs_columns = @{$headers}[1,30,64] ; - my @exp_columns = ("transcript", "oe_lof_upper", "gene_id"); + my @missing_columns; - foreach my $index (0 .. $#exp_columns){ - if ($obs_columns[$index] ne $exp_columns[$index]){ - push @missing_columns,$exp_columns[$index]; + foreach my $version (keys %{ $valid_headers }){ + @missing_columns = (); + foreach my $index (keys %{ $valid_headers->{$version} }){ + my $exp_column = $valid_headers->{$version}->{$index}; + my $obs_column = $headers->[$index] ? $headers->[$index] : ""; + + push @missing_columns, $exp_column if $exp_column ne $obs_column; + } + + unless (@missing_columns) { + $self->{data_version} = $version; + last; } } - my $missing_columns_str = join(", ", @missing_columns) if scalar @missing_columns; + my $missing_columns_str = join(", ", @missing_columns) if scalar @missing_columns; die "ERROR: Missing columns: $missing_columns_str\n" if defined($missing_columns_str); # Check match_by argument and store on self @@ -172,11 +193,12 @@ sub run { my $first_entry_flag = 0; foreach (@data){ if (looks_like_number($_->{result}->{LOEUF})){ - if (!$first_entry_flag){ - $min_loeuf_score = $_->{result}->{LOEUF}; - $first_entry_flag = 1 ; - next; - } + if (!$first_entry_flag){ + $min_loeuf_score = $_->{result}->{LOEUF}; + $first_entry_flag = 1 ; + next; + } + if ($_->{result}->{LOEUF} < $min_loeuf_score){ $min_loeuf_score = $_->{result}->{LOEUF}; } @@ -194,7 +216,9 @@ sub run { sub parse_data { my ($self, $line) = @_; my @values = split /\t/, $line; - my ($transcript_id, $oe_lof_upper, $gene_id) = @values[1,30,64]; + + my ($transcript_id, $oe_lof_upper, $gene_id) = $self->{data_version} ? + @values[keys %{ $valid_headers->{$self->{data_version}} }] : @values[1,30,64]; return { gene_id => $gene_id, transcript_id => $transcript_id, From e8903a7b4a649d97a399656fe0534a81f4540746 Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Thu, 7 Nov 2024 16:43:12 +0000 Subject: [PATCH 2/3] Add a note for pLI for v4 data --- pLI.pm | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pLI.pm b/pLI.pm index c5583a32..5f9015a0 100644 --- a/pLI.pm +++ b/pLI.pm @@ -68,6 +68,9 @@ pLI - Add pLI score to the VEP output ./vep -i variants.vcf --plugin pLI,values_file.txt ./vep -i variants.vcf --plugin pLI,values_file.txt,transcript # to check for the transcript score. + gnomAD v4 release expanded the scale of pLI score calculation. The file can be downloaded from - + https://gnomad.broadinstitute.org/downloads#v4-constraint (Constraint metrics TSV) + To use the data you can follow the same procedure as above but needs to change the column number to accordingly. =cut From e2bfa57c10c35512dd39d7dcd47878f57767ab9c Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Fri, 8 Nov 2024 11:30:45 +0000 Subject: [PATCH 3/3] Fix issue with hash --- LOEUF.pm | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/LOEUF.pm b/LOEUF.pm index 3217dcb5..03964065 100644 --- a/LOEUF.pm +++ b/LOEUF.pm @@ -86,15 +86,13 @@ use Scalar::Util qw(looks_like_number); # gnomAD v2 and v4 data have different headers my $valid_headers = { "v2" => { - 1 => "transcript", - 30 => "oe_lof_upper", - 64 => "gene_id" + "index" => [1, 30, 64], + "header" => ["transcript", "oe_lof_upper", "gene_id"] }, "v4" => { - 2 => "transcript", - 22 => "lof.oe_ci.upper", - 1 => "gene_id" - }, + "index" => [2, 22, 1], + "header" => ["transcript", "lof.oe_ci.upper", "gene_id"] + } }; sub new { @@ -129,12 +127,14 @@ sub new { my @missing_columns; foreach my $version (keys %{ $valid_headers }){ + my $i = 0; @missing_columns = (); - foreach my $index (keys %{ $valid_headers->{$version} }){ - my $exp_column = $valid_headers->{$version}->{$index}; + foreach my $index ( @{ $valid_headers->{$version}->{"index"} }){ + my $exp_column = $valid_headers->{$version}->{"header"}->[$i]; my $obs_column = $headers->[$index] ? $headers->[$index] : ""; push @missing_columns, $exp_column if $exp_column ne $obs_column; + $i++; } unless (@missing_columns) { @@ -218,7 +218,7 @@ sub parse_data { my @values = split /\t/, $line; my ($transcript_id, $oe_lof_upper, $gene_id) = $self->{data_version} ? - @values[keys %{ $valid_headers->{$self->{data_version}} }] : @values[1,30,64]; + @values[ @{ $valid_headers->{$self->{data_version}}->{"index"} } ] : @values[1,30,64]; return { gene_id => $gene_id, transcript_id => $transcript_id,