Skip to content

Commit

Permalink
remove sql usage of assembly_exception table
Browse files Browse the repository at this point in the history
refs #25
refs #22

As of ensembl 110, this table was emptied causing our detection of
primary assembly to be flawed. This adopts a slightly flawed approach
for detecting the primary assembly, but one that should unblock
builds. Chromosomes detection is also now incomplete.
  • Loading branch information
dhimmel committed Feb 14, 2024
1 parent 4b1a4d8 commit 9cdc34e
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 16 deletions.
12 changes: 6 additions & 6 deletions ensembl_genes/queries/gene_alt_alleles.sql
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
-- Get alt allele groups: genes with multiple alleles
SELECT
gene.stable_id AS ensembl_gene_id,
alt_allele.alt_allele_group_id,
alt_allele_attrib.attrib = "IS_REPRESENTATIVE" AS alt_allele_is_representative,
assembly_exception.exc_seq_region_id IS NULL AS primary_assembly,
-- we used to use assembly_exception to determine primary assembly, but this table is now empty
-- https://github.com/related-sciences/ensembl-genes/issues/22#issuecomment-1664197773
-- instead just look for a short seq_region name (e.g. '19' instead of 'HSCHR19LRC_PGF1_CTG3_1'),
-- even though this is a flawed method that would miss scaffolds that are primary assemblies.
LENGTH(seq_region.name) <= 3 AS primary_assembly,
seq_region.name AS seq_region,
alt_allele_attrib.attrib AS alt_allele_attrib,
gene.created_date AS ensembl_created_date
Expand All @@ -13,11 +18,6 @@ INNER JOIN alt_allele_attrib
ON alt_allele.alt_allele_id = alt_allele_attrib.alt_allele_id
INNER JOIN seq_region
ON gene.seq_region_id = seq_region.seq_region_id
LEFT JOIN assembly_exception
ON seq_region.seq_region_id = assembly_exception.seq_region_id
-- keep exc_type in (PATCH_FIX, PATCH_NOVEL, HAP)
-- refs internal Related Sciences issue 606.
AND NOT assembly_exception.exc_type <=> "PAR"
-- all genes were current when query was written, ensure this is always the case
WHERE gene.is_current
ORDER BY alt_allele_group_id, alt_allele_is_representative DESC, primary_assembly DESC, ensembl_created_date, ensembl_gene_id
Expand Down
18 changes: 8 additions & 10 deletions ensembl_genes/queries/genes.sql
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,24 @@ SELECT
gene.modified_date AS ensembl_modified_date,
coord_system.version AS coord_system_version,
coord_system.name AS coord_system,
-- get chromosome: refs internal Related Sciences issue 606.
CASE WHEN coord_system.name = "chromosome"
THEN COALESCE(exc_seq_region.name, seq_region.name)
-- we are not able to determine the chromosomes not on the primary assembly
CASE WHEN LENGTH(seq_region.name) <= 3
THEN seq_region.name
END AS chromosome,
assembly_exception.exc_type AS seq_region_exc_type,
seq_region.name AS seq_region,
gene.seq_region_start AS seq_region_start,
gene.seq_region_end AS seq_region_end,
gene.seq_region_strand AS seq_region_strand,
assembly_exception.exc_seq_region_id IS NULL AS primary_assembly
-- we used to use assembly_exception to determine primary assembly, but this table is now empty
-- https://github.com/related-sciences/ensembl-genes/issues/22#issuecomment-1664197773
-- instead just look for a short seq_region name (e.g. '19' instead of 'HSCHR19LRC_PGF1_CTG3_1')
-- even though this is a flawed method that would miss scaffolds that are primary assemblies.
LENGTH(seq_region.name) <= 3 AS primary_assembly
FROM gene
LEFT JOIN xref ON xref.xref_id = gene.display_xref_id
LEFT JOIN external_db ON xref.external_db_id = external_db.external_db_id
LEFT JOIN seq_region ON gene.seq_region_id = seq_region.seq_region_id
LEFT JOIN coord_system ON seq_region.coord_system_id = coord_system.coord_system_id
LEFT JOIN assembly_exception ON seq_region.seq_region_id = assembly_exception.seq_region_id
-- keep exc_type in (PATCH_FIX, PATCH_NOVEL, HAP)
-- refs internal Related Sciences issue 606.
AND NOT assembly_exception.exc_type <=> "PAR"
LEFT JOIN seq_region AS exc_seq_region ON assembly_exception.exc_seq_region_id = exc_seq_region.seq_region_id
WHERE
-- all genes were current when query was written, ensure this is always the case
gene.is_current AND
Expand Down

0 comments on commit 9cdc34e

Please sign in to comment.