diff --git a/config/ce11.clean.yml b/config/ce11.clean.yml deleted file mode 100644 index 913ddfa6f..000000000 --- a/config/ce11.clean.yml +++ /dev/null @@ -1,58 +0,0 @@ ---- -assembly: ce11 -chromosomes: - - chrI - - chrII - - chrIII - - chrIV - - chrM - - chrV - - chrX -database_dir: "~" -files: "~" -files_dir: ~ -statistics: - exonicAlleleFunctionField: refSeq.exonicAlleleFunction - outputExtensions: - json: .statistics.json - qc: .statistics.qc.tab - tab: .statistics.tab - refTrackField: ref - siteTypeField: refSeq.siteType - programPath: bystro-stats -temp_dir: "~" -fileProcessors: - snp: - args: --emptyField NA --minGq .95 - program: bystro-snp - vcf: - args: --emptyField NA --keepId --keepPos - program: bystro-vcf -tracks: - - name: ref - remote_dir: http://hgdownload.soe.ucsc.edu/goldenPath/ce11/chromosomes/ - remote_files: - - chrI.fa.gz - - chrII.fa.gz - - chrIII.fa.gz - - chrIV.fa.gz - - chrM.fa.gz - - chrV.fa.gz - - chrX.fa.gz - type: reference - - features: - - name - - name2 - name: refSeq - sql_statement: SELECT * FROM ce11.refGene - type: gene - - name: phastCons - remote_dir: http://hgdownload.soe.ucsc.edu/goldenPath/ce11/phastCons26way/ - remote_files: - - ce11.phastCons26way.wigFix.gz - type: score - - name: phyloP - remote_dir: http://hgdownload.soe.ucsc.edu/goldenPath/ce11/phyloP26way/ - remote_files: - - ce11.phyloP26way.wigFix.gz - type: score diff --git a/config/ce11.mapping.yml b/config/ce11.mapping.yml deleted file mode 120000 index 317a04219..000000000 --- a/config/ce11.mapping.yml +++ /dev/null @@ -1 +0,0 @@ -./hg19.mapping.yml \ No newline at end of file diff --git a/config/danRer10.clean.yml b/config/danRer10.clean.yml deleted file mode 100644 index 5d55bfc4c..000000000 --- a/config/danRer10.clean.yml +++ /dev/null @@ -1,64 +0,0 @@ ---- -assembly: danRer10 -chromosomes: - - chr1 - - chr2 - - chr3 - - chr4 - - chr5 - - chr6 - - chr7 - - chr8 - - chr9 - - chr10 - - chr11 - - chr12 - - chr13 - - chr14 - - chr15 - - chr16 - - chr17 - - 
chr18 - - chr19 - - chr20 - - chr21 - - chr22 - - chr23 - - chr24 - - chr25 - - chrM -database_dir: "~" -files: "~" -files_dir: ~ -statistics: - dbSNPnameField: ~ - exonicAlleleFunctionField: refSeq.exonicAlleleFunction - outputExtensions: - json: .statistics.json - qc: .statistics.qc.tab - tab: .statistics.tab - refTrackField: ref - siteTypeField: refSeq.siteType - programPath: bystro-stats -temp_dir: "~" -fileProcessors: - snp: - args: --emptyField NA --minGq .95 - program: bystro-snp - vcf: - args: --emptyField NA --keepId --keepPos - program: bystro-vcf -tracks: - - local_files: - - danRer10.fa.gz - name: ref - remote_dir: http://hgdownload.cse.ucsc.edu/goldenPath/danRer10/bigZips/ - remote_files: - - danRer10.fa.gz - type: reference - - features: - - name - - name2 - name: refSeq - sql_statement: SELECT * FROM danRer10.refGene - type: gene diff --git a/config/dm6.clean.yml b/config/dm6.clean.yml deleted file mode 100644 index 8ea93858e..000000000 --- a/config/dm6.clean.yml +++ /dev/null @@ -1,121 +0,0 @@ -assembly: dm6 -build_author: ec2-user -build_date: 2018-05-28T02:46:00 -chromosomes: - - chr2L - - chr2R - - chr3L - - chr3R - - chr4 - - chrM - - chrX - - chrY -database_dir: "~" -files_dir: "~" -statistics: - dbSNPnameField: "" - exonicAlleleFunctionField: refSeq.exonicAlleleFunction - outputExtensions: - json: .statistics.json - qc: .statistics.qc.tsv - tab: .statistics.tsv - refTrackField: ref - siteTypeField: refSeq.siteType -temp_dir: "~" -fileProcessors: - snp: - args: --emptyField NA --minGq .95 - program: bystro-snp - vcf: - args: --emptyField NA --keepId --keepPos - program: bystro-vcf -tracks: - - build_author: ec2-user - build_date: 2017-04-23T20:40:00 - fetch_date: 2017-04-23T20:17:00 - local_files: - - dm6.fa.gz - name: ref - remote_dir: http://hgdownload.cse.ucsc.edu/goldenPath/dm6/bigZips/ - remote_files: - - dm6.fa.gz - type: reference - version: 1 - - build_author: ec2-user - build_date: 2017-04-23T20:40:00 - features: - - name - - name2 - 
fetch_date: 2017-04-23T20:39:00 - local_files: - - dm6.refGene.chr2L.gz - - dm6.refGene.chr2R.gz - - dm6.refGene.chr3L.gz - - dm6.refGene.chr3R.gz - - dm6.refGene.chr4.gz - - dm6.refGene.chrX.gz - - dm6.refGene.chrY.gz - name: refSeq - sql_statement: SELECT * FROM dm6.refGene - type: gene - version: 1 - - build_author: ec2-user - build_date: 2018-05-28T02:46:00 - features: - - name - - name2 - fetch_date: 2018-05-28T02:42:00 - local_files: - - dm6.ensGene.chr2L.gz - - dm6.ensGene.chr2R.gz - - dm6.ensGene.chr3L.gz - - dm6.ensGene.chr3R.gz - - dm6.ensGene.chr4.gz - - dm6.ensGene.chrX.gz - - dm6.ensGene.chrY.gz - name: ensGene - sql_statement: SELECT * FROM dm6.ensGene - type: gene - version: 1 - - build_author: ec2-user - build_date: 2018-05-28T02:46:00 - features: - - name - - name2 - fetch_date: 2018-05-28T02:43:00 - local_files: - - dm6.ncbiRefSeq.chr2L.gz - - dm6.ncbiRefSeq.chr2R.gz - - dm6.ncbiRefSeq.chr3L.gz - - dm6.ncbiRefSeq.chr3R.gz - - dm6.ncbiRefSeq.chr4.gz - - dm6.ncbiRefSeq.chrM.gz - - dm6.ncbiRefSeq.chrX.gz - - dm6.ncbiRefSeq.chrY.gz - name: ncbiRefSeq - sql_statement: SELECT * FROM dm6.ncbiRefSeq - type: gene - version: 1 - - build_author: ec2-user - build_date: 2017-04-23T20:40:00 - fetch_date: 2017-04-23T20:16:00 - local_files: - - dm6.27way.phastCons.wigFix.gz - name: phastCons - remote_dir: http://hgdownload.cse.ucsc.edu/goldenPath/dm6/phastCons27way/ - remote_files: - - dm6.27way.phastCons.wigFix.gz - type: score - version: 1 - - build_author: ec2-user - build_date: 2017-04-23T20:40:00 - fetch_date: 2017-04-23T20:16:00 - local_files: - - dm6.phyloP27way.wigFix.gz - name: phyloP - remote_dir: http://hgdownload.cse.ucsc.edu/goldenPath/dm6/phyloP27way/ - remote_files: - - dm6.phyloP27way.wigFix.gz - type: score - version: 1 -version: 2 diff --git a/config/dm6.mapping.yml b/config/dm6.mapping.yml deleted file mode 100644 index 9685579c2..000000000 --- a/config/dm6.mapping.yml +++ /dev/null @@ -1,442 +0,0 @@ -#Our own, special field, tells an interface 
which fields to run prefix queries on -sort: - refSeq.codonNumber: avg - refSeq.codonPosition: avg - ensGene.codonNumber: avg - ensGene.codonPosition: avg - ncbiRefSeq.codonNumber: avg - ncbiRefSeq.codonPosition: avg - post_index_settings: - index: - refresh_interval: 15s - number_of_replicas: 1 - index_settings: - index: - refresh_interval: -1 - number_of_replicas: 0 - number_of_shards: 6 - codec: best_compression - analysis: - normalizer: - lowercase_normalizer: - type: custom - filter: - - lowercase - - asciifolding - uppercase_normalizer: - type: custom - filter: - - uppercase - - asciifolding - filter: - catenate_filter: - type: word_delimiter - catenate_words: true - catenate_numbers: true - catenate_all: true - preserve_original: false - generate_word_parts: false - stem_english_possessive: true - generate_number_parts: false - split_on_numerics: false - split_on_case_change: false - catenate_filter_split: - type: word_delimiter - catenate_words: true - catenate_numbers: true - catenate_all: true - preserve_original: false - generate_word_parts: true - stem_english_possessive: true - generate_number_parts: false - split_on_numerics: false - split_on_case_change: true - english_stemmer: - type: stemmer - language: light_english - search_synonym_filter: - type: synonym - synonyms_path: "analysis/search-synonyms.txt" - amino_synonym_filter: - type: synonym - synonyms_path: "analysis/amino-synonyms.txt" - type_synonym_filter: - type: synonym - synonyms_path: "analysis/type-synonyms.txt" - dbSNP_func_synonyms: - type: synonym - synonyms_path: "analysis/dbsnp-func-synonyms.txt" - dbSNP_class_synonyms: - type: synonym - synonyms_path: "analysis/dbsnp-class-synonyms.txt" - exonic_allele_function_search_synonyms: - type: synonym - synonyms_path: "analysis/exonic-allele-function-search-synonyms.txt" - site_type_synonym_filter: - type: synonym - synonyms_path: "analysis/site-type-synonyms.txt" - codon_map_synonym_filter: - type: synonym - synonyms_path: 
"analysis/codon-map-synonyms.txt" - description_synonyms: - type: synonym - synonyms_path: "analysis/refseq-description-synonyms.txt" - disease_synonyms: - type: synonym - synonyms_path: "analysis/disease-synonyms.txt" - autocomplete_filter: - type: edge_ngram - min_gram: 1 - max_gram: 30 - token_chars: - - letter - - digit - english_stop: - type: stop - stopwords: - - a - - an - - and - - are - - as - - at - - be - - but - - by - - for - - if - - in - - into - - is - - it - - of - - on - - or - - has - - such - - that - - the - - their - - then - - there - - these - - they - - this - - to - - was - - will - - with - - maybe - analyzer: - autocomplete_english: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - # - english_stop - - catenate_filter - - english_stemmer - - autocomplete_filter - autocomplete_english_split: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter_split - - english_stemmer - - autocomplete_filter - search_english: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter - - english_stemmer - - search_synonym_filter - search_english_split: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter_split - - english_stemmer - - search_synonym_filter - search_english_type: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter - - english_stemmer - - type_synonym_filter - - dbSNP_class_synonyms - search_english_description_synonyms: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter_split - - english_stemmer - - description_synonyms - - disease_synonyms - search_english_class: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter - - 
english_stemmer - - dbSNP_class_synonyms - search_english_func: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter - - english_stemmer - - dbSNP_func_synonyms - uppercase_keyword_text: - type: custom - tokenizer: keyword - filter: - - uppercase - - asciifolding - uppercase_keyword_text_codon: - type: custom - tokenizer: keyword - filter: - - uppercase - - asciifolding - - codon_map_synonym_filter - - amino_synonym_filter - uppercase_keyword_text_amino: - type: custom - tokenizer: keyword - filter: - - uppercase - - asciifolding - - amino_synonym_filter - mappings: - _all: - enabled: false - properties: - chrom: - type: keyword - normalizer: lowercase_normalizer - # chr's are very short, and the "prefix" is a completely valid value - # so, don't include in all, because many false positivies with ngrams - pos: - type: integer - trTv: - type: byte - type: - type: text - analyzer: autocomplete_english - search_analyzer: search_english_type - fields: - exact: - type: keyword - normalizer: lowercase_normalizer - discordant: - type: byte - heterozygotes: - type: keyword - heterozygosity: - type: half_float - homozygotes: - type: keyword - homozygosity: - type: half_float - missingGenos: - type: keyword - missingness: - type: half_float - ac: - type: integer - an: - type: integer - sampleMaf: - type: half_float - alt: - type: keyword - normalizer: uppercase_normalizer - ref: - type: keyword - normalizer: uppercase_normalizer - refSeq: - properties: - siteType: - type: text - analyzer: autocomplete_english - search_analyzer: search_english_func - fields: - exact: - type: keyword - normalizer: lowercase_normalizer - exonicAlleleFunction: - type: text - analyzer: autocomplete_english - search_analyzer: search_english_func - fields: - exact: - type: keyword - normalizer: lowercase_normalizer - refCodon: - type: keyword - normalizer: uppercase_normalizer - altCodon: - type: keyword - normalizer: 
uppercase_normalizer - refAminoAcid: - type: text - analyzer: uppercase_keyword_text - search_analyzer: uppercase_keyword_text_amino - fields: - exact: - type: keyword - normalizer: uppercase_normalizer - altAminoAcid: - type: text - analyzer: uppercase_keyword_text - search_analyzer: uppercase_keyword_text_amino - fields: - exact: - type: keyword - normalizer: uppercase_normalizer - codonPosition: - type: byte - codonNumber: - type: integer - strand: - type: keyword - name2: - type: keyword - normalizer: uppercase_normalizer - name: - type: keyword - normalizer: uppercase_normalizer - ensGene: - properties: - siteType: - type: text - analyzer: autocomplete_english - search_analyzer: search_english_func - fields: - exact: - type: keyword - normalizer: lowercase_normalizer - exonicAlleleFunction: - type: text - analyzer: autocomplete_english - search_analyzer: search_english_func - fields: - exact: - type: keyword - normalizer: lowercase_normalizer - refCodon: - type: keyword - normalizer: uppercase_normalizer - altCodon: - type: keyword - normalizer: uppercase_normalizer - refAminoAcid: - type: text - analyzer: uppercase_keyword_text - search_analyzer: uppercase_keyword_text_amino - fields: - exact: - type: keyword - normalizer: uppercase_normalizer - altAminoAcid: - type: text - analyzer: uppercase_keyword_text - search_analyzer: uppercase_keyword_text_amino - fields: - exact: - type: keyword - normalizer: uppercase_normalizer - codonPosition: - type: byte - codonNumber: - type: integer - strand: - type: keyword - name2: - type: keyword - normalizer: uppercase_normalizer - name: - type: keyword - normalizer: uppercase_normalizer - ncbiRefSeq: - properties: - siteType: - type: text - analyzer: autocomplete_english - search_analyzer: search_english_func - fields: - exact: - type: keyword - normalizer: lowercase_normalizer - exonicAlleleFunction: - type: text - analyzer: autocomplete_english - search_analyzer: search_english_func - fields: - exact: - type: keyword - 
normalizer: lowercase_normalizer - refCodon: - type: keyword - normalizer: uppercase_normalizer - altCodon: - type: keyword - normalizer: uppercase_normalizer - refAminoAcid: - type: text - analyzer: uppercase_keyword_text - search_analyzer: uppercase_keyword_text_amino - fields: - exact: - type: keyword - normalizer: uppercase_normalizer - altAminoAcid: - type: text - analyzer: uppercase_keyword_text - search_analyzer: uppercase_keyword_text_amino - fields: - exact: - type: keyword - normalizer: uppercase_normalizer - codonPosition: - type: byte - codonNumber: - type: integer - strand: - type: keyword - name2: - type: keyword - normalizer: uppercase_normalizer - name: - type: keyword - normalizer: uppercase_normalizer - phastCons: - type: scaled_float - scaling_factor: 100 - phyloP: - type: scaled_float - scaling_factor: 100 diff --git a/config/hg19_ensembl.clean.yml b/config/hg19_ensembl.clean.yml deleted file mode 100644 index 4686a0965..000000000 --- a/config/hg19_ensembl.clean.yml +++ /dev/null @@ -1,443 +0,0 @@ ---- -assembly: hg19 -build_author: ec2-user -build_date: 2017-10-07T17:00:00 -chromosomes: - - chr1 - - chr2 - - chr3 - - chr4 - - chr5 - - chr6 - - chr7 - - chr8 - - chr9 - - chr10 - - chr11 - - chr12 - - chr13 - - chr14 - - chr15 - - chr16 - - chr17 - - chr18 - - chr19 - - chr20 - - chr21 - - chr22 - - chrM - - chrX - - chrY -database_dir: "~" -files_dir: "~" -statistics: - dbSNPnameField: dbSNP.name - exonicAlleleFunctionField: refSeq.exonicAlleleFunction - outputExtensions: - json: .statistics.json - qc: .statistics.qc.tsv - tab: .statistics.tsv - programPath: bystro-stats - refTrackField: ref - siteTypeField: refSeq.siteType -fileProcessors: - snp: - args: --emptyField NA --minGq .95 - program: bystro-snp - vcf: - args: --emptyField NA --sample %sampleList% - program: bystro-vcf -temp_dir: "~" -tracks: - - build_author: ec2-user - build_date: 2017-09-27T18:27:00 - fetch_date: 2017-09-27T02:05:00 - local_files: - - chr1.fa.gz - - chr2.fa.gz - - 
chr3.fa.gz - - chr4.fa.gz - - chr5.fa.gz - - chr6.fa.gz - - chr7.fa.gz - - chr8.fa.gz - - chr9.fa.gz - - chr10.fa.gz - - chr11.fa.gz - - chr12.fa.gz - - chr13.fa.gz - - chr14.fa.gz - - chr15.fa.gz - - chr16.fa.gz - - chr17.fa.gz - - chr18.fa.gz - - chr19.fa.gz - - chr20.fa.gz - - chr21.fa.gz - - chr22.fa.gz - - chrM.fa.gz - - chrX.fa.gz - - chrY.fa.gz - name: ref - remote_dir: http://hgdownload.soe.ucsc.edu/goldenPath/hg19/chromosomes/ - remote_files: - - chr1.fa.gz - - chr2.fa.gz - - chr3.fa.gz - - chr4.fa.gz - - chr5.fa.gz - - chr6.fa.gz - - chr7.fa.gz - - chr8.fa.gz - - chr9.fa.gz - - chr10.fa.gz - - chr11.fa.gz - - chr12.fa.gz - - chr13.fa.gz - - chr14.fa.gz - - chr15.fa.gz - - chr16.fa.gz - - chr17.fa.gz - - chr18.fa.gz - - chr19.fa.gz - - chr20.fa.gz - - chr21.fa.gz - - chr22.fa.gz - - chrM.fa.gz - - chrX.fa.gz - - chrY.fa.gz - type: reference - version: 1 - - build_author: ec2-user - build_date: 2017-10-07T17:00:00 - features: - - name - - name2 - - gene - - strand - - txStart - fetch_date: 2017-10-07T16:57:00 - join: - features: - - alleleID - - phenotypeList - - clinicalSignificance - - type - - origin - - numberSubmitters - - reviewStatus - - chromStart - - chromEnd - track: clinvar - local_files: - - hg19.ensGene.chr1.gz - - hg19.ensGene.chr2.gz - - hg19.ensGene.chr3.gz - - hg19.ensGene.chr4.gz - - hg19.ensGene.chr5.gz - - hg19.ensGene.chr6.gz - - hg19.ensGene.chr7.gz - - hg19.ensGene.chr8.gz - - hg19.ensGene.chr9.gz - - hg19.ensGene.chr10.gz - - hg19.ensGene.chr11.gz - - hg19.ensGene.chr12.gz - - hg19.ensGene.chr13.gz - - hg19.ensGene.chr14.gz - - hg19.ensGene.chr15.gz - - hg19.ensGene.chr16.gz - - hg19.ensGene.chr17.gz - - hg19.ensGene.chr18.gz - - hg19.ensGene.chr19.gz - - hg19.ensGene.chr20.gz - - hg19.ensGene.chr21.gz - - hg19.ensGene.chr22.gz - - hg19.ensGene.chrM.gz - - hg19.ensGene.chrX.gz - - hg19.ensGene.chrY.gz - name: ensembl - sql_statement: - SELECT 
ensGene.name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,name2,value - AS gene FROM hg19.ensGene LEFT JOIN hg19.ensemblToGeneName ON (hg19.ensGene.name=hg19.ensemblToGeneName.name) - type: gene - version: 2 - - build_author: ec2-user - build_date: 2017-09-27T18:27:00 - local_files: - - chr*.phastCons100way.wigFix.gz - name: phastCons - remote_dir: http://hgdownload.soe.ucsc.edu/goldenPath/hg19/phastCons100way/hg19.100way.phastCons/ - remote_files: - - chr1.phastCons100way.wigFix.gz - - chr2.phastCons100way.wigFix.gz - - chr3.phastCons100way.wigFix.gz - - chr4.phastCons100way.wigFix.gz - - chr5.phastCons100way.wigFix.gz - - chr6.phastCons100way.wigFix.gz - - chr7.phastCons100way.wigFix.gz - - chr8.phastCons100way.wigFix.gz - - chr9.phastCons100way.wigFix.gz - - chr10.phastCons100way.wigFix.gz - - chr11.phastCons100way.wigFix.gz - - chr12.phastCons100way.wigFix.gz - - chr13.phastCons100way.wigFix.gz - - chr14.phastCons100way.wigFix.gz - - chr15.phastCons100way.wigFix.gz - - chr16.phastCons100way.wigFix.gz - - chr17.phastCons100way.wigFix.gz - - chr18.phastCons100way.wigFix.gz - - chr19.phastCons100way.wigFix.gz - - chr20.phastCons100way.wigFix.gz - - chr21.phastCons100way.wigFix.gz - - chr22.phastCons100way.wigFix.gz - - chrX.phastCons100way.wigFix.gz - - chrY.phastCons100way.wigFix.gz - - chrM.phastCons100way.wigFix.gz - type: score - version: 1 - - build_author: ec2-user - build_date: 2017-09-27T18:27:00 - local_files: - - chr*.phyloP100way.wigFix.gz - name: phyloP - remote_dir: http://hgdownload.soe.ucsc.edu/goldenPath/hg19/phyloP100way/hg19.100way.phyloP100way/ - remote_files: - - chr1.phyloP100way.wigFix.gz - - chr2.phyloP100way.wigFix.gz - - chr3.phyloP100way.wigFix.gz - - chr4.phyloP100way.wigFix.gz - - chr5.phyloP100way.wigFix.gz - - chr6.phyloP100way.wigFix.gz - - chr7.phyloP100way.wigFix.gz - - chr8.phyloP100way.wigFix.gz - - chr9.phyloP100way.wigFix.gz - - chr10.phyloP100way.wigFix.gz - - chr11.phyloP100way.wigFix.gz - - 
chr12.phyloP100way.wigFix.gz - - chr13.phyloP100way.wigFix.gz - - chr14.phyloP100way.wigFix.gz - - chr15.phyloP100way.wigFix.gz - - chr16.phyloP100way.wigFix.gz - - chr17.phyloP100way.wigFix.gz - - chr18.phyloP100way.wigFix.gz - - chr19.phyloP100way.wigFix.gz - - chr20.phyloP100way.wigFix.gz - - chr21.phyloP100way.wigFix.gz - - chr22.phyloP100way.wigFix.gz - - chrX.phyloP100way.wigFix.gz - - chrY.phyloP100way.wigFix.gz - - chrM.phyloP100way.wigFix.gz - type: score - version: 1 - - build_author: ec2-user - build_date: 2017-09-27T18:27:00 - caddToBed_date: 2017-04-22T06:41:00 - liftOverCadd_date: 2017-07-28T17:35:00 - local_files: - - whole_genome_SNVs.tsv.bed.chr*.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.bed.chrM.organized-by-chr.txt.sorted.txt.mapped.gz - name: cadd - sortCadd_date: 2017-04-23T15:44:00 - sort_date: 2017-01-20T16:06:00 - sorted: 1 - type: cadd - version: 1 - - build_author: ec2-user - build_date: 2017-09-27T18:27:00 - build_field_transformations: - alleleFreqs: split [,] - alleleNs: split [,] - alleles: split [,] - func: split [,] - observed: split [\/] - features: - - name - - strand - - observed - - class - - func - - alleles - - alleleNs: number - - alleleFreqs: number - fetch_date: 2017-09-27T02:12:00 - local_files: - - hg19.snp150.chr1.gz - - hg19.snp150.chr2.gz - - hg19.snp150.chr3.gz - - hg19.snp150.chr4.gz - - hg19.snp150.chr5.gz - - hg19.snp150.chr6.gz - - hg19.snp150.chr7.gz - - hg19.snp150.chr8.gz - - hg19.snp150.chr9.gz - - hg19.snp150.chr10.gz - - hg19.snp150.chr11.gz - - hg19.snp150.chr12.gz - - hg19.snp150.chr13.gz - - hg19.snp150.chr14.gz - - hg19.snp150.chr15.gz - - hg19.snp150.chr16.gz - - hg19.snp150.chr17.gz - - hg19.snp150.chr18.gz - - hg19.snp150.chr19.gz - - hg19.snp150.chr20.gz - - hg19.snp150.chr21.gz - - hg19.snp150.chr22.gz - - hg19.snp150.chrM.gz - - hg19.snp150.chrX.gz - - hg19.snp150.chrY.gz - name: dbSNP - sql_statement: SELECT * FROM hg19.snp150 - type: sparse - version: 1 - - based: 1 - 
build_author: ec2-user - build_date: 2017-09-27T18:27:00 - build_field_transformations: - chrom: chr . - clinicalSignificance: split [;] - origin: split [;] - phenotypeList: split [;] - reviewStatus: split [;] - type: split [;] - build_row_filters: - Assembly: == GRCh37 - features: - - alleleID: number - - phenotypeList - - clinicalSignificance - - type - - origin - - numberSubmitters: number - - reviewStatus - - referenceAllele - - alternateAllele - fetch_date: 2017-10-07T16:57:00 - fieldMap: - "#AlleleID": alleleID - AlternateAllele: alternateAllele - Chromosome: chrom - ClinicalSignificance: clinicalSignificance - NumberSubmitters: numberSubmitters - Origin: origin - PhenotypeIDS: phenotypeIDs - PhenotypeList: phenotypeList - ReferenceAllele: referenceAllele - ReviewStatus: reviewStatus - Start: chromStart - Stop: chromEnd - Type: type - local_files: - - variant_summary.txt.gz - name: clinvar - remote_files: - - ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz - type: sparse - version: 1 - - build_author: ec2-user - build_date: 2017-09-27T18:27:00 - build_row_filters: - AS_FilterStatus: == PASS - features: - - alt - - id - - af: number - - an: number - - an_afr: number - - an_amr: number - - an_asj: number - - an_eas: number - - an_fin: number - - an_nfe: number - - an_oth: number - - an_male: number - - an_female: number - - af_afr: number - - af_amr: number - - af_asj: number - - af_eas: number - - af_fin: number - - af_nfe: number - - af_oth: number - - af_male: number - - af_female: number - fieldMap: - AF: af - AF_AFR: af_afr - AF_AMR: af_amr - AF_ASJ: af_asj - AF_EAS: af_eas - AF_FIN: af_fin - AF_Female: af_female - AF_Male: af_male - AF_NFE: af_nfe - AF_OTH: af_oth - AN: an - AN_AFR: an_afr - AN_AMR: an_amr - AN_ASJ: an_asj - AN_EAS: an_eas - AN_FIN: an_fin - AN_Female: an_female - AN_Male: an_male - AN_NFE: an_nfe - AN_OTH: an_oth - local_files: - - gnomad.genomes.r2.0.1.sites.*.vcf.gz - name: gnomad.genomes - type: vcf - 
version: 1 - - build_author: ec2-user - build_date: 2017-09-27T18:27:00 - build_row_filters: - AS_FilterStatus: == PASS - features: - - alt - - id - - af: number - - an: number - - an_afr: number - - an_amr: number - - an_asj: number - - an_eas: number - - an_fin: number - - an_nfe: number - - an_oth: number - - an_male: number - - an_female: number - - af_afr: number - - af_amr: number - - af_asj: number - - af_eas: number - - af_fin: number - - af_nfe: number - - af_oth: number - - af_male: number - - af_female: number - fieldMap: - AF: af - AF_AFR: af_afr - AF_AMR: af_amr - AF_ASJ: af_asj - AF_EAS: af_eas - AF_FIN: af_fin - AF_Female: af_female - AF_Male: af_male - AF_NFE: af_nfe - AF_OTH: af_oth - AN: an - AN_AFR: an_afr - AN_AMR: an_amr - AN_ASJ: an_asj - AN_EAS: an_eas - AN_FIN: an_fin - AN_Female: an_female - AN_Male: an_male - AN_NFE: an_nfe - AN_OTH: an_oth - local_files: - - gnomad.exomes.r2.0.1.sites.vcf.gz - name: gnomad.exomes - type: vcf - version: 1 -version: 2 diff --git a/config/hg19_ensembl.mapping.yml b/config/hg19_ensembl.mapping.yml deleted file mode 100644 index a2d2194fc..000000000 --- a/config/hg19_ensembl.mapping.yml +++ /dev/null @@ -1,603 +0,0 @@ -#Our own, special field, tells an interface which fields to run prefix queries on -numericalFields: - - cadd - - phastCons - - phyloP - - pos - - dbSNP.alleleNs - - dbSNP.alleleFreqs - - ensembl.codonNumber - - ensembl.codonPosition - - clinvar.alleleID - - ensembl.clinvar.alleleID - - ensembl.clinvar.chromStart - - ensembl.clinvar.numberSubmitters - - clinvar.numberSubmitters -sort: - cadd: avg - dbSNP.alleleNs: avg - dbSNP.alleleFreqs: min - ensembl.codonNumber: avg - ensembl.codonPosition: avg -booleanFields: - - discordant -post_index_settings: - index: - refresh_interval: 15s - number_of_replicas: 1 -index_settings: - index: - refresh_interval: -1 - number_of_replicas: 0 - number_of_shards: 12 - codec: best_compression - analysis: - normalizer: - lowercase_normalizer: - type: custom - 
filter: - - lowercase - - asciifolding - uppercase_normalizer: - type: custom - filter: - - uppercase - - asciifolding - filter: - catenate_filter: - type: word_delimiter - catenate_words: true - catenate_numbers: true - catenate_all: true - preserve_original: false - generate_word_parts: false - stem_english_possessive: true - generate_number_parts: false - split_on_numerics: false - split_on_case_change: false - catenate_filter_split: - type: word_delimiter - catenate_words: true - catenate_numbers: true - catenate_all: true - preserve_original: false - generate_word_parts: true - stem_english_possessive: true - generate_number_parts: false - split_on_numerics: false - split_on_case_change: true - english_stemmer: - type: stemmer - language: light_english - english_possessive_stemmer: - type: stemmer - language: possessive_english - english_minimal_stemmer: - type: stemmer - language: minimal_english - search_synonym_filter: - type: synonym - synonyms_path : "analysis/search-synonyms.txt" - amino_synonym_filter: - type: synonym - synonyms_path : "analysis/amino-synonyms.txt" - type_synonym_filter: - type: synonym - synonyms_path : "analysis/type-synonyms.txt" - dbSNP_func_synonyms: - type: synonym - synonyms_path : "analysis/dbsnp-func-synonyms.txt" - dbSNP_class_synonyms: - type: synonym - synonyms_path : "analysis/dbsnp-class-synonyms.txt" - exonic_allele_function_search_synonyms: - type: synonym - synonyms_path : "analysis/exonic-allele-function-search-synonyms.txt" - site_type_synonym_filter: - type: synonym - synonyms_path : "analysis/site-type-synonyms.txt" - codon_map_synonym_filter: - type: synonym - synonyms_path : "analysis/codon-map-synonyms.txt" - description_synonyms: - type: synonym - synonyms_path : "analysis/refseq-description-synonyms.txt" - disease_synonyms: - type: synonym - synonyms_path : "analysis/disease-synonyms.txt" - autocomplete_filter: - type: edge_ngram - min_gram: 1 - max_gram: 30 - token_chars: - - letter - - digit - # english_stop: 
- # type: stop - # stopwords: - # - a - # - an - # - and - # - are - # - as - # - at - # - be - # - but - # - by - # - for - # - if - # - in - # - into - # - is - # - it - # - of - # - on - # - or - # - has - # - such - # - that - # - the - # - their - # - then - # - there - # - these - # - they - # - this - # - to - # - was - # - will - # - with - analyzer: - autocomplete_english: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter - - english_stemmer - - autocomplete_filter - autocomplete_english_split: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter_split - - english_stemmer - - autocomplete_filter - search_english: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter - - english_stemmer - - search_synonym_filter - search_english_split: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter_split - - english_stemmer - - search_synonym_filter - search_english_type: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter - - english_stemmer - - type_synonym_filter - - dbSNP_class_synonyms - search_english_description_synonyms: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter_split - - english_stemmer - - description_synonyms - - disease_synonyms - search_english_class: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter - - english_stemmer - - dbSNP_class_synonyms - search_english_func: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter - - english_stemmer - - dbSNP_func_synonyms - lowercase_keyword: - type: custom - tokenizer: keyword - filter: 
- - lowercase - - asciifolding - #- english_stop - - catenate_filter - - english_stemmer - lowercase_keyword_codon: - type: custom - tokenizer: keyword - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter - - english_stemmer - - codon_map_synonym_filter - - amino_synonym_filter - - search_synonym_filter - lowercase_keyword_amino: - type: custom - tokenizer: keyword - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter - - english_stemmer - - amino_synonym_filter - - search_synonym_filter -mappings: - _all: - enabled: false - properties: - chrom: - type: keyword - normalizer: lowercase_normalizer - # chr's are very short, and the "prefix" is a completely valid value - # so, don't include in all, because many false positivies with ngrams - pos: - type: integer - trTv: - type: byte - type: - type: text - analyzer: autocomplete_english - search_analyzer: search_english_type - fields: - exact: - type: keyword - normalizer: lowercase_normalizer - discordant: - type: boolean - heterozygotes: - type: keyword - heterozygosity: - type: half_float - homozygotes: - type: keyword - homozygosity: - type: half_float - missingGenos: - type: keyword - missingness: - type: half_float - sampleMaf: - type: half_float - alt: - type: keyword - normalizer: uppercase_normalizer - ref: - type: keyword - normalizer: uppercase_normalizer - ensembl: - properties: - siteType: - type: text - analyzer: autocomplete_english - #dbSNP func fields are similar to out siteType and exonicAlleleFunction fields - search_analyzer: search_english_func - fields: - exact: - type: keyword - normalizer: lowercase_normalizer - exonicAlleleFunction: - type: text - analyzer: autocomplete_english - search_analyzer: search_english_func - fields: - exact: - type: keyword - normalizer: lowercase_normalizer - refCodon: - type: keyword - normalizer: uppercase_normalizer - altCodon: - type: keyword - normalizer: uppercase_normalizer - refAminoAcid: - type: text - 
analyzer: lowercase_keyword - search_analyzer: lowercase_keyword_amino - altAminoAcid: - type: text - analyzer: lowercase_keyword - search_analyzer: lowercase_keyword_amino - codonPosition: - type: byte - codonNumber: - type: integer - strand: - type: keyword - name2: - type: keyword - normalizer: uppercase_normalizer - gene: - type: keyword - normalizer: uppercase_normalizer - name: - type: keyword - normalizer: uppercase_normalizer - clinvar: - properties: - alleleID: - type: integer - #phenotypeList and clinicalSignificance are more like traditional unstructured text fields - #I want them to be very easy to search - #TODO: remove high-frequency words, stopwords without screwing up all other field search - phenotypeList: - type: text - analyzer: autocomplete_english_split - search_analyzer: search_english_description_synonyms - clinicalSignificance: - type: text - analyzer: autocomplete_english_split - search_analyzer: search_english_split - type: - type: text - analyzer: autocomplete_english - search_analyzer: search_english_class - fields: - exact: - type: keyword - normalizer: lowercase_normalizer - origin: - type: text - analyzer: autocomplete_english_split - search_analyzer: search_english_split - numberSubmitters: - type: short - reviewStatus: - type: text - analyzer: autocomplete_english_split - search_analyzer: search_english_split - chromStart: - type: integer - chromEnd: - type: integer - phastCons: - type: scaled_float - scaling_factor: 1000 - phyloP: - type: half_float - cadd: - type: half_float - dbSNP: - properties: - name: - type: keyword - normalizer: lowercase_normalizer - strand: - type: keyword - observed: - type: keyword - normalizer: uppercase_normalizer - class: - type: text - analyzer: autocomplete_english - search_analyzer: search_english_class - fields: - exact: - type: keyword - normalizer: lowercase_normalizer - func: - type: text - analyzer: autocomplete_english - search_analyzer: search_english_func - fields: - exact: - type: keyword 
- normalizer: lowercase_normalizer - alleles: - type: keyword - normalizer: uppercase_normalizer - alleleNs: - type: scaled_float - scaling_factor: 10 - alleleFreqs: - type: half_float - clinvar: - properties: - alleleID: - type: integer - #phenotypeList and clinicalSignificance are more like traditional unstructured text fields - #I want them to be very easy to search - #TODO: remove high-frequency words, stopwords without screwing up all other field search - phenotypeList: - type: text - analyzer: autocomplete_english_split - search_analyzer: search_english_description_synonyms - fields: - exact: - type: keyword - normalizer: lowercase_normalizer - clinicalSignificance: - type: text - analyzer: autocomplete_english_split - search_analyzer: search_english_split - fields: - exact: - type: keyword - normalizer: lowercase_normalizer - type: - type: text - analyzer: autocomplete_english - search_analyzer: search_english_class - fields: - exact: - type: keyword - normalizer: lowercase_normalizer - origin: - type: text - analyzer: autocomplete_english_split - search_analyzer: search_english_split - numberSubmitters: - type: short - reviewStatus: - type: text - analyzer: autocomplete_english_split - search_analyzer: search_english_split - referenceAllele: - type: keyword - normalizer: uppercase_normalizer - alternateAllele: - type: keyword - normalizer: uppercase_normalizer - gnomad: - properties: - genomes: - properties: - alt: - type: keyword - normalizer: uppercase_normalizer - id: - type: keyword - normalizer: lowercase_normalizer - trTv: - type: byte - af: - type: half_float - # gnomad genomes has a max of 30,992 alleles (15,496 samples), fits short - an: - type: short - an_afr: - type: short - an_amr: - type: short - an_asj: - type: short - an_eas: - type: short - an_fin: - type: short - an_nfe: - type: short - an_oth: - type: short - an_male: - type: short - an_female: - type: short - af_afr: - type: half_float - af_amr: - type: half_float - af_asj: - type: 
half_float - af_eas: - type: half_float - af_fin: - type: half_float - af_nfe: - type: half_float - af_oth: - type: half_float - af_male: - type: half_float - af_female: - type: half_float - exomes: - properties: - alt: - type: keyword - normalizer: uppercase_normalizer - id: - type: keyword - normalizer: lowercase_normalizer - trTv: - type: byte - # ac: - # type: integer - af: - type: half_float - an: - type: integer - # ac_afr: - # type: integer - # ac_amr: - # type: integer - # ac_asj: - # type: integer - # ac_eas: - # type: integer - # ac_fin: - # type: integer - # ac_nfe: - # type: integer - # ac_oth: - # type: integer - # ac_male: - # type: integer - # ac_female: - # type: integer - an_afr: - type: integer - an_amr: - type: integer - an_asj: - type: integer - an_eas: - type: integer - an_fin: - type: integer - an_nfe: - type: integer - an_oth: - type: integer - an_male: - type: integer - an_female: - type: integer - af_afr: - type: half_float - af_amr: - type: half_float - af_asj: - type: half_float - af_eas: - type: half_float - af_fin: - type: half_float - af_nfe: - type: half_float - af_oth: - type: half_float - af_male: - type: half_float - af_female: - type: half_float \ No newline at end of file diff --git a/config/hg38-small.clean.yml b/config/hg38-small.clean.yml deleted file mode 100644 index 0033ac459..000000000 --- a/config/hg38-small.clean.yml +++ /dev/null @@ -1,218 +0,0 @@ ---- -assembly: hg38 -build_author: ec2-user -build_date: 2018-09-07T19:32:00 -chromosomes: - - chr1 - - chr2 - - chr3 - - chr4 - - chr5 - - chr6 - - chr7 - - chr8 - - chr9 - - chr10 - - chr11 - - chr12 - - chr13 - - chr14 - - chr15 - - chr16 - - chr17 - - chr18 - - chr19 - - chr20 - - chr21 - - chr22 - - chrM - - chrX - - chrY -database_dir: ~ -fileProcessors: - snp: - args: --emptyField NA --minGq .95 - program: bystro-snp - vcf: - args: --emptyField NA --sample %sampleList% - program: bystro-vcf -files_dir: ~ -statistics: - dbSNPnameField: dbSNP.name - 
exonicAlleleFunctionField: refSeq.exonicAlleleFunction - outputExtensions: - json: .statistics.json - qc: .statistics.qc.tsv - tab: .statistics.tsv - programPath: bystro-stats - refTrackField: ref - siteTypeField: refSeq.siteType -temp_dir: ~ -tracks: - outputOrder: - - ref - - refSeq - - nearest.refSeq - - cadd - tracks: - - build_author: ec2-user - build_date: 2018-09-07T19:32:00 - local_files: - - chr*.fa.gz - name: ref - type: reference - utils: - - args: - remoteDir: http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/ - remoteFiles: - - chr1.fa.gz - - chr2.fa.gz - - chr3.fa.gz - - chr4.fa.gz - - chr5.fa.gz - - chr6.fa.gz - - chr7.fa.gz - - chr8.fa.gz - - chr9.fa.gz - - chr10.fa.gz - - chr11.fa.gz - - chr12.fa.gz - - chr13.fa.gz - - chr14.fa.gz - - chr15.fa.gz - - chr16.fa.gz - - chr17.fa.gz - - chr18.fa.gz - - chr19.fa.gz - - chr20.fa.gz - - chr21.fa.gz - - chr22.fa.gz - - chrM.fa.gz - - chrX.fa.gz - - chrY.fa.gz - completed: 2017-11-24T02:27:00 - name: fetch - version: 28 - - build_author: ec2-user - build_date: 2018-09-07T19:32:00 - dist: true - features: - - name2 - - name - from: txStart - local_files: - - hg38.kgXref.chr*.with_dbnsfp.gz - name: nearest.refSeq - to: txEnd - type: nearest - version: 2 - - build_author: ec2-user - build_date: 2018-09-07T19:32:00 - local_files: - - whole_genome_SNVs.tsv.chr1.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr10.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr11.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr12.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr13.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr14.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr15.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr16.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr17.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr18.organized-by-chr.txt.sorted.txt.gz - - 
whole_genome_SNVs.tsv.chr19.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr2.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr20.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr21.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr22.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr3.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr4.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr5.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr6.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr7.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr8.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chr9.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chrX.organized-by-chr.txt.sorted.txt.gz - - whole_genome_SNVs.tsv.chrY.organized-by-chr.txt.sorted.txt.gz - name: cadd - sorted: 1 - type: cadd - utils: - - args: - remoteFiles: - - http://krishna.gs.washington.edu/download/CADD/v1.4/GRCh38/whole_genome_SNVs.tsv.gz - completed: 2018-09-06T03:52:00 - name: fetch - - completed: 2018-09-06T05:39:00 - name: SortCadd - version: 19 - - build_author: ec2-user - build_date: 2018-09-07T19:32:00 - build_field_transformations: - description: split [;] - ensemblID: split [;] - kgID: split [;] - mRNA: split [;] - protAcc: split [;] - rfamAcc: split [;] - spDisplayID: split [;] - spID: split [;] - tRnaName: split [;] - features: - - name - - name2 - local_files: - - hg38.kgXref.chr8.with_dbnsfp.gz - - hg38.kgXref.chr4.with_dbnsfp.gz - - hg38.kgXref.chr3.with_dbnsfp.gz - - hg38.kgXref.chr1.with_dbnsfp.gz - - hg38.kgXref.chr6.with_dbnsfp.gz - - hg38.kgXref.chr2.with_dbnsfp.gz - - hg38.kgXref.chr5.with_dbnsfp.gz - - hg38.kgXref.chr7.with_dbnsfp.gz - - hg38.kgXref.chr10.with_dbnsfp.gz - - hg38.kgXref.chr9.with_dbnsfp.gz - - hg38.kgXref.chr16.with_dbnsfp.gz - - hg38.kgXref.chr11.with_dbnsfp.gz - - hg38.kgXref.chr12.with_dbnsfp.gz - 
- hg38.kgXref.chr14.with_dbnsfp.gz - - hg38.kgXref.chr15.with_dbnsfp.gz - - hg38.kgXref.chr13.with_dbnsfp.gz - - hg38.kgXref.chr18.with_dbnsfp.gz - - hg38.kgXref.chrY.with_dbnsfp.gz - - hg38.kgXref.chrM.with_dbnsfp.gz - - hg38.kgXref.chr17.with_dbnsfp.gz - - hg38.kgXref.chr22.with_dbnsfp.gz - - hg38.kgXref.chr21.with_dbnsfp.gz - - hg38.kgXref.chrX.with_dbnsfp.gz - - hg38.kgXref.chr19.with_dbnsfp.gz - - hg38.kgXref.chr20.with_dbnsfp.gz - name: refSeq - type: gene - utils: - - args: - connection: - database: hg38 - sql: - SELECT r.*, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.kgID, '')) SEPARATOR - ';') FROM kgXref x WHERE x.refseq=r.name) AS kgID, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.description, - '')) SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS description, - (SELECT GROUP_CONCAT(DISTINCT(NULLIF(e.value, '')) SEPARATOR ';') FROM knownToEnsembl - e JOIN kgXref x ON x.kgID = e.name WHERE x.refseq = r.name) AS ensemblID, - (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.tRnaName, '')) SEPARATOR ';') FROM - kgXref x WHERE x.refseq=r.name) AS tRnaName, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.spID, - '')) SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS spID, (SELECT - GROUP_CONCAT(DISTINCT(NULLIF(x.spDisplayID, '')) SEPARATOR ';') FROM kgXref - x WHERE x.refseq=r.name) AS spDisplayID, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.protAcc, - '')) SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS protAcc, (SELECT - GROUP_CONCAT(DISTINCT(NULLIF(x.mRNA, '')) SEPARATOR ';') FROM kgXref x WHERE - x.refseq=r.name) AS mRNA, (SELECT GROUP_CONCAT(DISTINCT(NULLIF(x.rfamAcc, - '')) SEPARATOR ';') FROM kgXref x WHERE x.refseq=r.name) AS rfamAcc FROM - refGene r WHERE chrom=%chromosomes%; - completed: 2018-09-07T14:04:00 - name: fetch - - args: - geneFile: /mnt/bystro-files/dbnsfp//dbNSFP3.5_gene.complete - completed: 2018-09-07T14:05:00 - name: refGeneXdbnsfp - version: 28 -version: 215 diff --git a/config/mm10.clean.yml b/config/mm10.clean.yml deleted file mode 100644 index 
c1d112f9a..000000000 --- a/config/mm10.clean.yml +++ /dev/null @@ -1,160 +0,0 @@ ---- -assembly: mm10 -chromosomes: - - chr1 - - chr2 - - chr3 - - chr4 - - chr5 - - chr6 - - chr7 - - chr8 - - chr9 - - chr10 - - chr11 - - chr12 - - chr13 - - chr14 - - chr15 - - chr16 - - chr17 - - chr18 - - chr19 - - chrM - - chrX - - chrY -database_dir: "~" -files: "~" -files_dir: ~ -statistics: - dbSNPnameField: dbSNP.name - exonicAlleleFunctionField: refSeq.exonicAlleleFunction - outputExtensions: - json: .statistics.json - qc: .statistics.qc.tab - tab: .statistics.tab - refTrackField: ref - siteTypeField: refSeq.siteType - programPath: bystro-stats -temp_dir: "~" -fileProcessors: - snp: - args: --emptyField NA --minGq .95 - program: bystro-snp - vcf: - args: --emptyField NA --sample %sampleList% --keepPos --keepId - program: bystro-vcf -tracks: - - name: ref - remote_dir: http://hgdownload.soe.ucsc.edu/goldenPath/mm10/chromosomes/ - remote_files: - - chr1.fa.gz - - chr2.fa.gz - - chr3.fa.gz - - chr4.fa.gz - - chr5.fa.gz - - chr6.fa.gz - - chr7.fa.gz - - chr8.fa.gz - - chr9.fa.gz - - chr10.fa.gz - - chr11.fa.gz - - chr12.fa.gz - - chr13.fa.gz - - chr14.fa.gz - - chr15.fa.gz - - chr16.fa.gz - - chr17.fa.gz - - chr18.fa.gz - - chr19.fa.gz - - chrM.fa.gz - - chrX.fa.gz - - chrY.fa.gz - type: reference - - features: - - kgID - - mRNA - - spID - - spDisplayID - - refseq - - protAcc - - description - - rfamAcc - - name - - name2 - name: refSeq - sql_statement: - SELECT * FROM mm10.refGene LEFT JOIN mm10.kgXref ON mm10.kgXref.refseq - = mm10.refGene.name - type: gene - version: 1 - - name: phastCons - remote_dir: http://hgdownload.cse.ucsc.edu/goldenPath/mm10/phastCons60way/mm10.60way.phastCons/ - remote_files: - - chr1.phastCons60way.wigFix.gz - - chr2.phastCons60way.wigFix.gz - - chr3.phastCons60way.wigFix.gz - - chr4.phastCons60way.wigFix.gz - - chr5.phastCons60way.wigFix.gz - - chr6.phastCons60way.wigFix.gz - - chr7.phastCons60way.wigFix.gz - - chr8.phastCons60way.wigFix.gz - - 
chr9.phastCons60way.wigFix.gz - - chr10.phastCons60way.wigFix.gz - - chr11.phastCons60way.wigFix.gz - - chr12.phastCons60way.wigFix.gz - - chr13.phastCons60way.wigFix.gz - - chr14.phastCons60way.wigFix.gz - - chr15.phastCons60way.wigFix.gz - - chr16.phastCons60way.wigFix.gz - - chr17.phastCons60way.wigFix.gz - - chr18.phastCons60way.wigFix.gz - - chr19.phastCons60way.wigFix.gz - - chrX.phastCons60way.wigFix.gz - - chrY.phastCons60way.wigFix.gz - - chrM.phastCons60way.wigFix.gz - type: score - - name: phyloP - remote_dir: http://hgdownload.cse.ucsc.edu/goldenPath/mm10/phyloP60way/mm10.60way.phyloP60way/ - remote_files: - - chr1.phyloP60way.wigFix.gz - - chr2.phyloP60way.wigFix.gz - - chr3.phyloP60way.wigFix.gz - - chr4.phyloP60way.wigFix.gz - - chr5.phyloP60way.wigFix.gz - - chr6.phyloP60way.wigFix.gz - - chr7.phyloP60way.wigFix.gz - - chr8.phyloP60way.wigFix.gz - - chr9.phyloP60way.wigFix.gz - - chr10.phyloP60way.wigFix.gz - - chr11.phyloP60way.wigFix.gz - - chr12.phyloP60way.wigFix.gz - - chr13.phyloP60way.wigFix.gz - - chr14.phyloP60way.wigFix.gz - - chr15.phyloP60way.wigFix.gz - - chr16.phyloP60way.wigFix.gz - - chr17.phyloP60way.wigFix.gz - - chr18.phyloP60way.wigFix.gz - - chr19.phyloP60way.wigFix.gz - - chrX.phyloP60way.wigFix.gz - - chrY.phyloP60way.wigFix.gz - - chrM.phyloP60way.wigFix.gz - type: score - - build_field_transformations: - alleleFreqs: split [,] - alleleNs: split [,] - alleles: split [,] - func: split [,] - observed: split [\/] - features: - - name - - strand - - observed - - class - - func - - alleles - - alleleNs: number - - alleleFreqs: number - name: dbSNP - sql_statement: SELECT * FROM mm10.snp142 - type: sparse diff --git a/config/mm10.mapping.yml b/config/mm10.mapping.yml deleted file mode 120000 index 317a04219..000000000 --- a/config/mm10.mapping.yml +++ /dev/null @@ -1 +0,0 @@ -./hg19.mapping.yml \ No newline at end of file diff --git a/config/mm9.clean.yml b/config/mm9.clean.yml deleted file mode 100644 index b43d055e5..000000000 
--- a/config/mm9.clean.yml +++ /dev/null @@ -1,154 +0,0 @@ ---- -assembly: mm9 -chromosomes: - - chr1 - - chr2 - - chr3 - - chr4 - - chr5 - - chr6 - - chr7 - - chr8 - - chr9 - - chr10 - - chr11 - - chr12 - - chr13 - - chr14 - - chr15 - - chr16 - - chr17 - - chr18 - - chr19 - - chrM - - chrX - - chrY -database_dir: "~" -files: "~" -files_dir: ~ -statistics: - dbSNPnameField: dbSNP.name - exonicAlleleFunctionField: refSeq.exonicAlleleFunction - outputExtensions: - json: .statistics.json - qc: .statistics.qc.tab - tab: .statistics.tab - refTrackField: ref - siteTypeField: refSeq.siteType - programPath: bystro-stats -temp_dir: "~" -fileProcessors: - snp: - args: --emptyField NA --minGq .95 - program: bystro-snp - vcf: - args: --emptyField NA --keepId --keepPos - program: bystro-vcf -tracks: - - name: ref - remote_dir: http://hgdownload.soe.ucsc.edu/goldenPath/mm9/chromosomes/ - remote_files: - - chr1.fa.gz - - chr2.fa.gz - - chr3.fa.gz - - chr4.fa.gz - - chr5.fa.gz - - chr6.fa.gz - - chr7.fa.gz - - chr8.fa.gz - - chr9.fa.gz - - chr10.fa.gz - - chr11.fa.gz - - chr12.fa.gz - - chr13.fa.gz - - chr14.fa.gz - - chr15.fa.gz - - chr16.fa.gz - - chr17.fa.gz - - chr18.fa.gz - - chr19.fa.gz - - chrM.fa.gz - - chrX.fa.gz - - chrY.fa.gz - type: reference - - features: - - kgID - - mRNA - - spID - - spDisplayID - - refseq - - protAcc - - description - - name - - name2 - name: refSeq - sql_statement: - SELECT * FROM mm9.refGene LEFT JOIN mm9.kgXref ON mm9.kgXref.refseq - = mm9.refGene.name - type: gene - - name: phastCons - remote_dir: http://hgdownload.cse.ucsc.edu/goldenPath/mm9/phastCons30way/vertebrate/ - remote_files: - - chr1.data.gz - - chr2.data.gz - - chr3.data.gz - - chr4.data.gz - - chr5.data.gz - - chr6.data.gz - - chr7.data.gz - - chr8.data.gz - - chr9.data.gz - - chr10.data.gz - - chr11.data.gz - - chr12.data.gz - - chr13.data.gz - - chr14.data.gz - - chr15.data.gz - - chr16.data.gz - - chr17.data.gz - - chr18.data.gz - - chr19.data.gz - - chrX.data.gz - - chrY.data.gz 
- - chrM.data.gz - type: score - - name: phyloP - remote_dir: http://hgdownload.cse.ucsc.edu/goldenPath/mm9/phyloP30way/vertebrate/ - remote_files: - - chr1.phyloP30way.wigFix.gz - - chr2.phyloP30way.wigFix.gz - - chr3.phyloP30way.wigFix.gz - - chr4.phyloP30way.wigFix.gz - - chr5.phyloP30way.wigFix.gz - - chr6.phyloP30way.wigFix.gz - - chr7.phyloP30way.wigFix.gz - - chr8.phyloP30way.wigFix.gz - - chr9.phyloP30way.wigFix.gz - - chr10.phyloP30way.wigFix.gz - - chr11.phyloP30way.wigFix.gz - - chr12.phyloP30way.wigFix.gz - - chr13.phyloP30way.wigFix.gz - - chr14.phyloP30way.wigFix.gz - - chr15.phyloP30way.wigFix.gz - - chr16.phyloP30way.wigFix.gz - - chr17.phyloP30way.wigFix.gz - - chr18.phyloP30way.wigFix.gz - - chr19.phyloP30way.wigFix.gz - - chrX.phyloP30way.wigFix.gz - - chrY.phyloP30way.wigFix.gz - - chrM.phyloP30way.wigFix.gz - type: score - - build_field_transformations: - func: split [,] - observed: split [\/] - features: - - name - - strand - - observed - - class - - func - - avHet: number - - avHetSE: number - name: dbSNP - sql_statement: SELECT * FROM mm9.snp128 - type: sparse diff --git a/config/mm9.mapping.yml b/config/mm9.mapping.yml deleted file mode 120000 index 317a04219..000000000 --- a/config/mm9.mapping.yml +++ /dev/null @@ -1 +0,0 @@ -./hg19.mapping.yml \ No newline at end of file diff --git a/config/rheMac8.clean.yml b/config/rheMac8.clean.yml deleted file mode 100644 index f16e7ee08..000000000 --- a/config/rheMac8.clean.yml +++ /dev/null @@ -1,66 +0,0 @@ ---- -assembly: rheMac8 -chromosomes: -- chr1 -- chr2 -- chr3 -- chr4 -- chr5 -- chr6 -- chr7 -- chr8 -- chr9 -- chr10 -- chr11 -- chr12 -- chr13 -- chr14 -- chr15 -- chr16 -- chr17 -- chr18 -- chr19 -- chr20 -- chrM -- chrX -- chrY -database_dir: '~' -files: '~' -files_dir: ~ -fileProcessors: - snp: - args: --emptyField NA --minGq .95 - program: bystro-snp - vcf: - args: --emptyField NA --keepId --keepPos - program: bystro-vcf -statistics: - exonicAlleleFunctionField: refSeq.exonicAlleleFunction 
- outputExtensions: - json: .statistics.json - qc: .statistics.qc.tab - tab: .statistics.tab - refTrackField: ref - siteTypeField: refSeq.siteType - programPath: bystro-stats -temp_dir: '~' -fileProcessors: - snp: - args: --emptyField NA --minGq .95 - program: bystro-snp - vcf: - args: --emptyField NA --keepId --keepPos - program: bystro-vcf -tracks: -- name: ref - remote_dir: http://hgdownload.soe.ucsc.edu/goldenPath/rheMac8/bigZips/ - remote_files: - - rheMac8.fa.gz - type: reference -- features: - - name - - name2 - name: refSeq - sql_statement: SELECT * FROM rheMac8.refGene - type: gene - diff --git a/config/rheMac8.mapping.yml b/config/rheMac8.mapping.yml deleted file mode 120000 index 317a04219..000000000 --- a/config/rheMac8.mapping.yml +++ /dev/null @@ -1 +0,0 @@ -./hg19.mapping.yml \ No newline at end of file diff --git a/config/rn6.clean.yml b/config/rn6.clean.yml deleted file mode 100644 index f306b1859..000000000 --- a/config/rn6.clean.yml +++ /dev/null @@ -1,71 +0,0 @@ ---- -assembly: rn6 -chromosomes: - - chr1 - - chr2 - - chr3 - - chr4 - - chr5 - - chr6 - - chr7 - - chr8 - - chr9 - - chr10 - - chr11 - - chr12 - - chr13 - - chr14 - - chr15 - - chr16 - - chr17 - - chr18 - - chr19 - - chr20 - - chrM - - chrX -database_dir: "~" -files: "~" -files_dir: ~ -statistics: - exonicAlleleFunctionField: refSeq.exonicAlleleFunction - outputExtensions: - json: .statistics.json - qc: .statistics.qc.tab - tab: .statistics.tab - refTrackField: ref - siteTypeField: refSeq.siteType - programPath: bystro-stats -temp_dir: "~" -fileProcessors: - snp: - args: --emptyField NA --minGq .95 - program: bystro-snp - vcf: - args: --emptyField NA --keepId --keepPos - program: bystro-vcf -tracks: - - name: ref - remote_dir: http://hgdownload.cse.ucsc.edu/goldenPath/rn6/bigZips/ - remote_files: - - rn6.fa.gz - type: reference - - features: - - name - - name2 - name: refSeq - sql_statement: SELECT * FROM rn6.refGene - type: gene - - local_files: - - rn6.phastCons20way.wigFix.gz - 
name: phastCons - remote_dir: http://hgdownload.cse.ucsc.edu/goldenPath/rn6/phastCons20way/ - remote_files: - - rn6.phastCons20way.wigFix.gz - type: score - - local_files: - - rn6.phyloP20way.wigFix.gz - name: phyloP - remote_dir: http://hgdownload.cse.ucsc.edu/goldenPath/rn6/phyloP20way/ - remote_files: - - rn6.phyloP20way.wigFix.gz - type: score diff --git a/config/rn6.mapping.yml b/config/rn6.mapping.yml deleted file mode 120000 index 317a04219..000000000 --- a/config/rn6.mapping.yml +++ /dev/null @@ -1 +0,0 @@ -./hg19.mapping.yml \ No newline at end of file diff --git a/config/sacCer3.clean.yml b/config/sacCer3.clean.yml deleted file mode 100644 index 9b08c8df0..000000000 --- a/config/sacCer3.clean.yml +++ /dev/null @@ -1,93 +0,0 @@ ---- -assembly: sacCer3 -chromosomes: - - chrI - - chrII - - chrIII - - chrIV - - chrIX - - chrM - - chrV - - chrVI - - chrVII - - chrVIII - - chrX - - chrXI - - chrXII - - chrXIII - - chrXIV - - chrXV - - chrXVI -database_dir: "~" -files: "~" -files_dir: ~ -statistics: - exonicAlleleFunctionField: sgd.exonicAlleleFunction - outputExtensions: - json: .statistics.json - qc: .statistics.qc.tab - tab: .statistics.tab - refTrackField: ref - siteTypeField: sgd.siteType - programPath: bystro-stats -temp_dir: "~" -fileProcessors: - snp: - args: --emptyField NA --minGq .95 - program: bystro-snp - vcf: - args: --emptyField NA --keepId --keepPos - program: bystro-vcf -tracks: - - name: ref - remote_dir: http://hgdownload.soe.ucsc.edu/goldenPath/sacCer3/chromosomes/ - remote_files: - - chrI.fa.gz - - chrII.fa.gz - - chrIII.fa.gz - - chrIV.fa.gz - - chrIX.fa.gz - - chrM.fa.gz - - chrV.fa.gz - - chrVI.fa.gz - - chrVII.fa.gz - - chrVIII.fa.gz - - chrX.fa.gz - - chrXI.fa.gz - - chrXII.fa.gz - - chrXIII.fa.gz - - chrXIV.fa.gz - - chrXV.fa.gz - - chrXVI.fa.gz - type: reference - - features: - - name - - type - - description - - proteinID - name: sgd - sql_statement: - SELECT 
sgdGene.name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,proteinID,type,description - FROM sacCer3.sgdGene LEFT JOIN sgdDescription ON sgdGene.name = sgdDescription.name - type: gene - - name: phastCons - remote_dir: http://hgdownload.cse.ucsc.edu/goldenPath/sacCer3/phastCons7way/ - remote_files: - - sacCer3.chrI.wigFixed.gz - - sacCer3.chrII.wigFixed.gz - - sacCer3.chrIII.wigFixed.gz - - sacCer3.chrIV.wigFixed.gz - - sacCer3.chrIX.wigFixed.gz - - sacCer3.chrM.wigFixed.gz - - sacCer3.chrV.wigFixed.gz - - sacCer3.chrVI.wigFixed.gz - - sacCer3.chrVII.wigFixed.gz - - sacCer3.chrVIII.wigFixed.gz - - sacCer3.chrX.wigFixed.gz - - sacCer3.chrXI.wigFixed.gz - - sacCer3.chrXII.wigFixed.gz - - sacCer3.chrXIII.wigFixed.gz - - sacCer3.chrXIV.wigFixed.gz - - sacCer3.chrXV.wigFixed.gz - - sacCer3.chrXVI.wigFixed.gz - type: score diff --git a/config/sacCer3.mapping.yml b/config/sacCer3.mapping.yml deleted file mode 100644 index 5cec874db..000000000 --- a/config/sacCer3.mapping.yml +++ /dev/null @@ -1,360 +0,0 @@ -#Our own, special field, tells an interface which fields to run prefix queries on -numericalFields: - - phastCons - - pos - - sgd.codonNumber - - sgd.codonPosition -sort: - sgd.codonNumber: avg - sgd.codonPosition: avg -booleanFields: - - discordant -# These only refer to searchable exact fields -# These should be case sensitive, they are, by definition, exact -hasExactFields: - - type - - sgd.name - -post_index_settings: - index: - refresh_interval: 15s - number_of_replicas: 1 -index_settings: - index: - refresh_interval: -1 - number_of_replicas: 0 - number_of_shards: 9 - codec: best_compression - analysis: - normalizer: - lowercase_normalizer: - type: custom - filter: - - lowercase - - asciifolding - uppercase_normalizer: - type: custom - filter: - - uppercase - - asciifolding - filter: - catenate_filter: - type: word_delimiter - catenate_words: true - catenate_numbers: true - catenate_all: true - preserve_original: false - 
generate_word_parts: false - stem_english_possessive: true - generate_number_parts: false - split_on_numerics: false - split_on_case_change: false - catenate_filter_split: - type: word_delimiter - catenate_words: true - catenate_numbers: true - catenate_all: true - preserve_original: false - generate_word_parts: true - stem_english_possessive: true - generate_number_parts: false - split_on_numerics: false - split_on_case_change: true - - english_stemmer: - type: stemmer - language: light_english - english_possessive_stemmer: - type: stemmer - language: possessive_english - english_minimal_stemmer: - type: stemmer - language: minimal_english - search_synonym_filter: - type: synonym - synonyms_path : "analysis/search-synonyms.txt" - amino_synonym_filter: - type: synonym - synonyms_path : "analysis/amino-synonyms.txt" - type_synonym_filter: - type: synonym - synonyms_path : "analysis/type-synonyms.txt" - dbSNP_func_synonyms: - type: synonym - synonyms_path : "analysis/dbsnp-func-synonyms.txt" - dbSNP_class_synonyms: - type: synonym - synonyms_path : "analysis/dbsnp-class-synonyms.txt" - exonic_allele_function_search_synonyms: - type: synonym - synonyms_path : "analysis/exonic-allele-function-search-synonyms.txt" - site_type_synonym_filter: - type: synonym - synonyms_path : "analysis/site-type-synonyms.txt" - codon_map_synonym_filter: - type: synonym - synonyms_path : "analysis/codon-map-synonyms.txt" - description_synonyms: - type: synonym - synonyms_path : "analysis/refseq-description-synonyms.txt" - disease_synonyms: - type: synonym - synonyms_path : "analysis/disease-synonyms.txt" - autocomplete_filter: - type: edge_ngram - min_gram: 1 - max_gram: 30 - token_chars: - - letter - - digit - english_stop: - type: stop - stopwords: - - a - - an - - and - - are - - as - - at - - be - - but - - by - - for - - if - - in - - into - - is - - it - - of - - on - - or - - has - - such - - that - - the - - their - - then - - there - - these - - they - - this - - to - - was - - 
will - - with - analyzer: - autocomplete_english: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter - - english_stemmer - - autocomplete_filter - autocomplete_english_split: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter_split - - english_stemmer - - autocomplete_filter - search_english: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter - - english_stemmer - - search_synonym_filter - search_english_split: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter_split - - english_stemmer - - search_synonym_filter - search_english_type: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter - - english_stemmer - - type_synonym_filter - - dbSNP_class_synonyms - search_english_description_synonyms: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter_split - - english_stemmer - - description_synonyms - - disease_synonyms - search_english_class: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter - - english_stemmer - - dbSNP_class_synonyms - search_english_func: - type: custom - tokenizer: whitespace - filter: - - lowercase - - asciifolding - #- english_stop - - catenate_filter - - english_stemmer - - dbSNP_func_synonyms - uppercase_keyword_text: - type: custom - tokenizer: keyword - filter: - - uppercase - - asciifolding - uppercase_keyword_text_codon: - type: custom - tokenizer: keyword - filter: - - uppercase - - asciifolding - - codon_map_synonym_filter - - amino_synonym_filter - uppercase_keyword_text_amino: - type: custom - tokenizer: keyword - filter: - - uppercase - - asciifolding - - amino_synonym_filter 
-mappings: - _all: - enabled: false - properties: - chrom: - type: keyword - normalizer: lowercase_normalizer - # chr's are very short, and the "prefix" is a completely valid value - # so, don't include in all, because many false positivies with ngrams - pos: - type: integer - trTv: - type: byte - type: - type: text - analyzer: autocomplete_english - search_analyzer: search_english_type - fields: - exact: - type: keyword - normalizer: lowercase_normalizer - discordant: - type: boolean - heterozygotes: - type: keyword - heterozygosity: - type: half_float - homozygotes: - type: keyword - homozygosity: - type: half_float - missingGenos: - type: keyword - missingness: - type: half_float - sampleMaf: - type: half_float - alt: - type: keyword - normalizer: uppercase_normalizer - ref: - type: keyword - normalizer: uppercase_normalizer - sgd: - properties: - siteType: - type: text - analyzer: autocomplete_english - #dbSNP func fields are similar to out siteType and exonicAlleleFunction fields - search_analyzer: search_english_func - fields: - exact: - type: keyword - normalizer: lowercase_normalizer - exonicAlleleFunction: - type: text - analyzer: autocomplete_english - search_analyzer: search_english_func - fields: - exact: - type: keyword - normalizer: lowercase_normalizer - refCodon: - type: keyword - normalizer: uppercase_normalizer - altCodon: - type: keyword - normalizer: uppercase_normalizer - refAminoAcid: - type: text - analyzer: uppercase_keyword_text - search_analyzer: uppercase_keyword_text_amino - fields: - exact: - type: keyword - normalizer: uppercase_normalizer - altAminoAcid: - type: text - analyzer: uppercase_keyword_text - search_analyzer: uppercase_keyword_text_amino - fields: - exact: - type: keyword - normalizer: uppercase_normalizer - codonPosition: - type: byte - codonNumber: - type: integer - strand: - type: keyword - # kgID is in the form uc001uub.1 , seems to always be lowercase - type: - type: keyword - normalizer: lowercase_normalizer - 
proteinID: - type: keyword - normalizer: uppercase_normalizer - name: - type: text - analyzer: autocomplete_english - search_analyzer: search_english - fields: - exact: - type: keyword - normalizer: uppercase_normalizer - description: - type: text - analyzer: autocomplete_english_split - search_analyzer: search_english_description_synonyms - phastCons: - type: scaled_float - scaling_factor: 1000 \ No newline at end of file diff --git a/perl/ANNOTATION.md b/perl/ANNOTATION.md index 4e0accc08..095991b35 100644 --- a/perl/ANNOTATION.md +++ b/perl/ANNOTATION.md @@ -1,46 +1,93 @@ -# Annotation Fields (Human Assembly hg38 and hg19) +# Bystro High Dimensional Genetics Annotator Documentation -

Annotation Field Description

+## What is the Bystro Annotator? -##### _Italicized fields_ are custom Bystro fields. All others are sourced as described. +The Bystro Annotator is the fastest and most comprehensive data curation and labeling library in the world for genetic data. It takes 1 or more VCF ([Variant Call Format](https://samtools.github.io/hts-specs/VCFv4.2.pdf)) or SNP ([PEMapper/PECaller](https://www.pnas.org/doi/full/10.1073/pnas.1618065114)) files as input, and outputs a cleaned and thoroughly labeled (annotated) representation of your data, along with a genotype dosage matrix in the [Arrow Feather V2/IPC format](https://arrow.apache.org/docs/python/feather.html), and a set of statistics on the data. -
+Bystro Annotator processes both variants and sample genotypes. It is capable of processing millions of samples and billions of mutations on commodity hardware such as a laptop or a workstation. It is roughly **100,000** times faster than [Variant Effect Predictor](https://www.ensembl.org/info/docs/tools/vep/index.html) (VEP), **1,000** times faster than [Annovar](https://doc-openbio.readthedocs.io/projects/annovar/en/latest/), and **100** times faster than [Illumina Connected Annotations](https://developer.illumina.com/illumina-connected-annotations), all while outputting more annotations than any of these tools. What takes VEP years to do, Bystro can do in minutes to hours, all without requiring multiple servers. -#### Summary of the Bystro Annotator +Bystro's performance isn't just about speed, it's also about comprehensiveness. For instance, because of its performance, Bystro can afford to provide complete annotations, both genome and exome wide, from gnomad v4.1, for all populations and subpopulations. **No other tool can do this**. -Bystro takes 1 or more VCF ([Variant Call Format](https://samtools.github.io/hts-specs/VCFv4.2.pdf)) or SNP ([PEMapper/PECaller](https://www.pnas.org/doi/full/10.1073/pnas.1618065114) output) files, performs quality control, generates statistics, and outputs an annotated file that is in an easy-to-parse tab-separated format. +## Running Your First Annotation -Each row in the output file corresponds to a single variant (mutation), and contains a set of annotations that describe the site's genomic context, functional impact, allele frequencies in various populations, and more. +See the [INSTALL.md#configuring-the-bystro-annotator](../INSTALL.md#configuring-the-bystro-annotator) section for instructions on how to configure the Bystro Annotator. -The annotations are divided into several categories, each of which is described in detail below.
+```sh +bystro-annotate.pl --config ~/bystro/config/hg38.yml --threads 32 --input gnomad.genomes.v4.0.sites.chr22.vcf.bgz --output test/my_annotation --compress gz +``` -#### Output Data Structures +The above command will annotate the `gnomad.genomes.v4.0.sites.chr22.vcf.bgz` file with the hg38 database, using 32 threads, and output the results to `test`, and will use `my_annotation` as the prefix for output files. + +The result of this command will be: + +```sh +Created completion file +{ + "error" : null, + "totalProgress" : 8599234, + "totalSkipped" : 0, + "results" : { + "header" : "my_annotation.annotation.header.json", + "sampleList" : "my_annotation.sample_list", + "annotation" : "my_annotation.annotation.tsv.gz", + "dosageMatrixOutPath" : "my_annotation.dosage.feather", + "config" : "hg38.yml", + "log" : "my_annotation.annotation.log.txt", + "statistics" : { + "qc" : "my_annotation.statistics.qc.tsv", + "json" : "my_annotation.statistics.json", + "tab" : "my_annotation.statistics.tsv" + } + } +} +``` -Bystro's output format is designed to retain and reflect complex relationships within genomic annotations. Here are key aspects of how we store the data: +Explanation of the output: -1. **Array-Based Fields**: Each annotation field is an array. For fields with multiple values (e.g., transcripts, gene names), the values are separated by semicolons (`;`). The order of the values is maintained across related fields to preserve relationships between the data points. For example: +- `my_annotation.annotation.header.json`: The header of the annotated dataset +- `my_annotation.sample_list`: The list of samples in the annotated dataset +- `my_annotation.annotation.tsv.gz`: A block gzipped TSV file with one row per variant and one column per annotation. Can be decompressed with `bgzip` or any program compatible with the gzip format, like `gzip` and `pigz`. 
+- `my_annotation.dosage.feather`: The dosage matrix file, where the first column is the `locus` column in the format "chr:pos:ref:alt", and columns following that are sample columns, with the dosage of the variant for that sample (0 for homozygous reference, 1 for 1 copy of the alternate allele, 2 for 2, and so on). -1 indicates missing genotypes. The dosage is the expected number of alternate alleles, given the genotype. This is useful for downstream analyses like imputation, or for calculating polygenic risk scores + - This file is in the [Arrow feather format](https://arrow.apache.org/docs/python/feather.html), also known as the "IPC" format. This is an ultra-efficient format for machine learning, and is widely supported, in Python libraries like [Pandas](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_feather.html), [Polars](https://docs.pola.rs/api/python/stable/reference/api/polars.read_ipc.html), [PyArrow](https://arrow.apache.org/docs/python/generated/pyarrow.feather.read_feather.html), as well as languages like [R](https://arrow.apache.org/docs/r/reference/read_feather.html) and [Julia](https://github.com/apache/arrow-julia) +- `hg38.yml`: The configuration file used for the annotation. 
You can use this to either re-build the Bystro Annotation Database from scratch, or to re-run the annotation with the same configuration +- `my_annotation.annotation.log.txt`: The log file for the annotation +- `my_annotation.statistics.tsv`: A TSV file with sample-wise statistics on the annotation +- `my_annotation.statistics.qc.tsv`: A TSV file that lists any samples that failed quality control checks, currently defined as being outside 3 standard deviations from the mean on any of the sample-wise statistics +- `my_annotation.statistics.json`: A JSON file with the same sample-wise statistics on the annotation +- 'totalProgress': The number of variants processed; this is the number of variants passed to the Bystro annotator by the bystro-vcf pre-processor, which performs primary quality control checks, such as excluding sites that have no samples with non-missing genotypes, or which are not FILTER=PASS in the input VCF. We also exclude sites that are not in the Bystro Annotation Database, and sites that are not in the Bystro Annotation Database that are not in the input VCF. In more detail: + - Variants must have FILTER value of PASS or " . " + - Variants and ref must be ACTG (no structural variants retained) + - Multiallelics are split into separate records, and annotated separately + - MNPs are split into separate SNPs and annotated separately + - Indels are left-aligned + - The first base of an indel must be the reference base after multiallelic decomposition and left-alignment + - If genotypes are provided, entirely missing sites are dropped - - `refSeq.name` might contain `NM1;NM2`, and `refSeq.name2` (gene symbols for these transcripts) could be `Gene1;Gene1`. This ensures the first transcript, `NM1`, corresponds to `Gene1`, and the second transcript, `NM2`, also corresponds to `Gene1`. These relationships are maintained across all fields within a track. 
+## Let's Take a Closer Look at the Annotation Output + +The Bystro annotation output is a tab-separated file with one header row, and then N rows of annotated variants, one variant per row. The annotations are divided into several categories, each of which is described in detail in the [Bystro Annotation Fields](#bystro-annotation-fields) section. -2. **Nested Arrays**: Some fields may be nested arrays, where the individual entries are further divided using forward slashes (`/`). For example, if a transcript has alternate IDs, you may see `NM1a/NM1b;NM2`, indicating two alternate IDs for the first transcript. This way we can maintain the relationships between the order of fields. +## Bystro Annotation Output In Depth -3. **Insertions and Deletions**: For transcript-level annotations like `refSeq.*` (refSeq transcript annotations), `nearest.*`, and `nearestTss.*`, insertions and deletions affecting multiple nucleotides are separated by pipes (`|`). This allows reporting a transcript consequence per disrupted base. +One of the key advantages of Bystro's design is that it outputs data in such a complete manner that it is possible to re-create the source files used for annotation from the Bystro annotation output. -4. **Reserved Delimiters**: Reserved delimiters like `;`, `/`, and `|` are essential for maintaining the structure of relationships in the data. If these characters appear in the source data, they are replaced with commas to ensure they can still be used effectively as delimiters in the output. +Bystro's output formats are designed to retain and reflect complex nested relationships between variant descriptions. Here are key aspects of how we output the data: -This structure enables precise retention of complex data relationships across multiple annotation fields while maintaining a highly structured and parseable output. +1. **Array-Based Fields**: Each annotation field is an array. 
For fields with multiple values (e.g., transcripts, gene names), the values are separated by semicolons (`;`). The order of the values is maintained across related fields to preserve relationships between the data points. For example: -#### What Information Can Bystro Output? + - `refSeq.name` might contain `NM1;NM2`, and `refSeq.name2` (gene symbols for these transcripts) could be `Gene1;Gene1`. This ensures the first transcript, `NM1`, corresponds to `Gene1`, and the second transcript, `NM2`, also corresponds to `Gene1`. These relationships are maintained across all fields within a track. -Bystro is a general-purpose annotation engine, and has no restrictions on the types of annotations/feature labels it can output. To output annotations, the user must provide a Bystro Database, which is a lightning-fast embedded key-value database that is keyed on genomic position, and a YAML configuration file that describes the fields to be output and where to source them from. +2. **Nested Arrays**: Some fields may be nested arrays, where the individual entries are further divided using forward slashes (`/`). For example, if a transcript has alternate IDs, you may see `NM1a/NM1b;NM2`, indicating two alternate IDs for the first transcript (NM1a and NM1b) and 1 for the 2nd (NM2). This way we can maintain the relationships between the order of fields. -- A Bystro Database is a high-performance key-value database that uses the Lightning Memory Map Database (LMDB) engine. It supports millions of lookups per second on a single machine, and can be used to store and retrieve annotations for millions of variants. +3. **Insertions and Deletions**: For transcript-level annotations like `refSeq.*` (refSeq transcript annotations), `nearest.*`, and `nearestTss.*` (nearest gene by transcript boundaries and by distance to the transcription start site respectively), insertions and deletions affecting multiple nucleotides are separated by pipes (`|`). 
This allows reporting a transcript consequence per disrupted base. -- Bystro Databases can be re-created from the YAML configuration file corresponding to that database, and new databases with different information can be created by editing the YAML configuration file, and re-running the Bystro Database creation process. +4. **Reserved Delimiters**: The reserved delimiters described in points 1-3 (`;`, `/`, and `|`) will be stripped and replaced with a comma if found in source inputs to the Bystro Annotation Database. -- To create a Bystro Database, the user needs to provide a YAML configuration file that specifies all of the source file locations, the location to write the database, and the tracks/fields to output, and then runs `bystro-build.pl --config /path/to/config`. This will create a Bystro Database that can be used to annotate VCF or SNP files. +## What Information Can Bystro Annotator Output? -#### Variant Representation +Bystro Annotator is a general-purpose data curation and labeling engine, and has no restrictions on the types of annotations/feature labels it can output. + +## Variant Representations Bystro's variant representation deviates slightly from the standard VCF format in the name of simplicity. In particular, it drops the rule that the alternate allele must be ACTG, so that it can represent deletions on the base that the deletion actually occurs, rather than the base before the deletion. This has a number of surprising benefits: @@ -52,11 +99,11 @@ Bystro's variant representation deviates slightly from the standard VCF format i Bystro also has the ability to output a genotype dosage matrix in the [Arrow Feather V2/IPC format](https://arrow.apache.org/docs/python/feather.html) which is a matrix of the number of alternate alleles for each sample at each variant. 
-#### Comparing the VCF and Bystro Variant Representations +### Comparing the VCF and Bystro Variant Representations To run these example, you will need to have the Bystro VCF preprocessor installed. You can install it by running `go install github.com/bystrogenomics/bystro-vcf@2.2.3`. -Please note that we are not showing the full Bystro Annotator outputs, which are far too extensive to easily display here. Instead, we are showing a subset of the fields outputted by the Bystro VCF preprocessor, which is a tool that takes a VCF file and outputs a Bystro-annotator compatible TSV file, which is then used by the Perl Bystro Annotator to add annotations to the VCF file from the Bystro Database. +Please note that we are not showing the full Bystro Annotator outputs, which are far too extensive to easily display here. Instead, we are showing a subset of the fields outputted by the Bystro VCF preprocessor, which is a tool that takes a VCF file and outputs a Bystro-annotator compatible TSV file, which is then used by the Perl Bystro Annotator to add annotations to the VCF file from the Bystro Annotation Database. ``` cat ~/bystro/perl/example_vcf.tsv | bystro-vcf --keepId --emptyField "NA" --keepPos @@ -118,7 +165,7 @@ print(df) - Note that missing genotypes are reprsented as -1 in the genotype dosage matrix output - If a sample's genotype contains any missing genotypes, the sample is considered missing for the site -##### Explanation For SIMPLE_SNP +#### Explanation For SIMPLE_SNP VCF Representation: @@ -140,7 +187,7 @@ Bystro Genotype Dosage Matrix Output: The Bystro and VCF formats for simple, well-normalized SNPs are the same. 
In addition to the position, variant type, reference, and alternate, Bystro's VCF preprocessor (bystro-vcf) also outputs whether a variant is a transition (1), transversion (2) or neither (0), descriptive information about the genotypes, including which samples are heterozyogtes, homozygotes, or missing genotypes, vcfPos (which describes the original position in the VCF file, pre-normalization), and the VCF ID. Meanwhile the genotype dosage matrix output shows the number of alternate alleles for each sample at each variant. -##### Explanation For MULTIALLELIC_SNP +#### Explanation For MULTIALLELIC_SNP VCF Representation: @@ -157,7 +204,7 @@ Bystro Representation: The VCF representation shows two different SNPs at the same position. NA00001 and NA00002 have 1 copy of each allele, while NA00003 has 2 copies of the A>T allele and 0 copies of A>G. Bystro's representation decomposes the multiallelic site into two separate rows, one for each allele. The first row shows the A>G allele, and the second row shows the A>T allele. Since NA00001 and NA00002 are heterozygous for both A>G and A>T, on each line they are listed in the heterozygotes columns, while NA00003 is homozygous for A>T and is listed in the homozygotes column only for the A>T allele row. The zygosity and sampleMaf (sample minor allele frequency) fields are calculated based on the allele in the row. -##### Explanation For SIMPLE_INSERTION +#### Explanation For SIMPLE_INSERTION VCF Representation: @@ -173,7 +220,7 @@ Bystro Representation: The VCF representation shows an insertion of a C base after the A base at position 1. Bystro's representation shows the insertion as occurring at the A base, with the reference base being A and the alternate allele being +C. The heterozygotes column lists NA00003 as heterozygous for the insertion. 
-##### Explanation For INSERTION_BETWEEN_TWO_BASES +#### Explanation For INSERTION_BETWEEN_TWO_BASES VCF Representation: @@ -189,7 +236,7 @@ Bystro Representation: The VCF representation shows an insertion of CC between the A and T bases. Bystro's representation shows the insertion as occurring after the A base, with the reference base being A and the alternate allele being +CC. NA00001 and NA00002 are heterozygous for the insertion, while NA00003 is homozygous for the insertion and therefore listed in the homozygotes column. -##### Explanation For microsat1 +#### Explanation For microsat1 VCF Representation: @@ -208,7 +255,7 @@ The VCF representation shows a multiallelic site with two alleles. The first all The second allele is GTCT>GTACT, with the insertion of an "A" occuring after the "T" base at position 1234568. Again, because of the VCF format's padding rule, this representation cannot be shown directly in the VCF format, but must be inferred. Bystro normalizes the representation, showing the insertion at the correct base, 1234568. -##### Explanation For EXAMPLE_MISSING_MNP +#### Explanation For EXAMPLE_MISSING_MNP VCF Representation: @@ -226,9 +273,21 @@ Bystro Representation: The VCF representation shows a multi-nucleotide polymorphism (MNP) at position 3, where 3 bases are changed from CCC to AAA. An MNP is really 3 single nucleotide polymorphisms next to each other, typically linked on the same chromosome. Bystro decomposes the MNP into 3 separate rows, each with a single nucleotide change. The first row shows the first base change, the second row shows the second base change, and the third row shows the third base change. NA00001 was unsuccessfully typed, with 1 of its 2 chromosomes having an ambiguous or low quality genotype ("."). Bystro, to be conservative ("garbage in means garbage out"), counts this sample as having a missing genotype, and subtracts 2 from the `an` (allele number). 
-### Annotation Fields for Default Human Assembly hg38 and hg19 Bystro Databases +## What is the Bystro Annotation Database? + +To output annotations, the user must point Bystro Annotator at a Bystro Annotation Database, which is a high-performance embedded memory-mapped database used by the Bystro Annotator to label variants. Three default databases are provided, for humans (hg19 and hg38), and rats (rn11). See the [INSTALL.md#databases](./INSTALL.md#databases) section for more information on how to download these databases. + +Key points: + +- A Bystro Annotation Database is a high-performance memory-mapped key-value database that uses the Lightning Memory-Mapped Database (LMDB) engine. It supports millions of lookups per second on a single machine, and can be used to store and retrieve annotations for millions of variants. + +- Bystro Annotation Databases can be re-created from the YAML configuration file corresponding to that database, and new databases with different information can be created by editing the YAML configuration file, and re-running the Bystro Annotation Database creation process. + +- To create a Bystro Annotation Database, the user needs to provide a YAML configuration file that specifies all of the source file locations, the location to write the database, and the tracks/fields to output, and then runs `bystro-build.pl --config /path/to/config`. This will create a Bystro Annotation Database that can be used to annotate VCF or SNP files. + +## Annotation Fields for Default Human Assembly hg38 and hg19 Bystro Annotation Databases -#### Basic Fields +### Basic Fields Sourced from the input file, or calculated based on input fields from the VCF or SNP file pre-processor. 
@@ -287,22 +346,22 @@ Sourced from the input file, or calculated based on input fields from the VCF or `discordant` - TRUE if the reference base provided in the input VCF matches the Bystro-annotated UCSC reference, FALSE otherwise -`ref` - The Bystro-annotated reference base(s), from the 'ref' track in the Bystro Database +`ref` - The Bystro-annotated reference base(s), from the 'ref' track in the Bystro Annotation Database -- In the default Bystro Database, this is sourced from the UCSC reference genome -- In custom Bystro Databases, this can be sourced from any reference genome +- In the default Bystro Annotation Database, this is sourced from the UCSC reference genome +- In custom Bystro Annotation Databases, this can be sourced from any reference genome - In the case of insertions the `ref` will be 2 bases long, the base just before the insertion, and the one right after - In the case of deletions, the ref will be as long as the deletion, up to 32 bases (after that, the ref will be truncated)
-#### Transcript Annotations +### Transcript Annotations -In the default Bystro Database, we source transcript annotations from the UCSC refGene track, joined on other UCSC tracks: knownToEnsembl, kgXref, knownCanonical. +In the default Bystro Annotation Database, we source transcript annotations from the UCSC refGene track, joined on other UCSC tracks: knownToEnsembl, kgXref, knownCanonical. - See [refGene]('https://sc-bro.nhlbi.nih.gov/cgi-bin/hgTables?hgsid=554_JXUlabut7OUQtCyNphC8FGaeUJnj&hgta_doSchemaDb=hg38&hgta_doSchemaTable=refGene') and [kgXref]('https://sc-bro.nhlbi.nih.gov/cgi-bin/hgTables?hgsid=554_JXUlabut7OUQtCyNphC8FGaeUJnj&hgta_doSchemaDb=hg38& hgta_doSchemaTable=kgXref') for more information -- In custom Bystro Databases, these annotations can be sourced from any UCSC transcript track, and multiple such `gene` type tracks can be defined in a single Bystro Database (annotations for all will be outputted) +- In custom Bystro Annotation Databases, these annotations can be sourced from any UCSC transcript track, and multiple such `gene` type tracks can be defined in a single Bystro Annotation Database (annotations for all will be outputted) - **When a site is intergenic, all `refSeq` annotations will be `NA`** @@ -355,7 +414,7 @@ In the default Bystro Database, we source transcript annotations from the UCSC r
-#### nearest.refSeq +### nearest.refSeq The nearest transcript(s), calculated by trascript start, transcript end boundaries. Transcripts that are equidistant are all outputted. @@ -367,7 +426,7 @@ The nearest transcript(s), calculated by trascript start, transcript end boundar
-#### nearestTss.refSeq +### nearestTss.refSeq The nearest transcript(s), calculated by the distance to the nearest transcript start site (TSS). Transcripts with the same TSS are all outputted. @@ -379,13 +438,13 @@ The nearest transcript(s), calculated by the distance to the nearest transcript
-#### gnomAD Annotations +### gnomAD Annotations Annotations from the gnomAD v4.1 (hg38 assembly annotations) or v2.1.1 (hg19 assembly annotations) whole-genome set Since the data available for hg19 and hg38 differ, we will discuss them separately below. -#### hg38 gnomad.joint +### hg38 gnomad.joint Annotations from the gnomAD v4.1 (hg38 assembly annotations) joint set @@ -461,7 +520,7 @@ Annotations from the gnomAD v4.1 (hg38 assembly annotations) joint set `gnomad.joint.AN_grpmax_joint`: Total number of alleles in the genetic ancestry group with the maximum allele frequency in the joint subset -#### hg38 gnomad.genomes +### hg38 gnomad.genomes Annotations from the gnomAD v4.1 whole-genome set @@ -537,7 +596,7 @@ Annotations from the gnomAD v4.1 whole-genome set
-#### hg38 gnomad.exomes +### hg38 gnomad.exomes Annotations from gnomAD v4.1 whole-exome set @@ -657,7 +716,7 @@ Annotations from gnomAD v4.1 whole-exome set
-#### hg19 gnomad.genomes (v2.1.1 - latest release for hg19) +### hg19 gnomad.genomes (v2.1.1 - latest release for hg19) `gnomad.genomes.alt`: The Bystro VCF-preprocessor's ALT record for this gnomAD site. This should always match the row's `alt` field value @@ -725,7 +784,7 @@ Annotations from gnomAD v4.1 whole-exome set
-#### hg19 gnomad.exomes (v2.1.1 - latest release for hg19) +### hg19 gnomad.exomes (v2.1.1 - latest release for hg19) Annotations from the gnomAD v2.1.1 exome set @@ -803,7 +862,7 @@ Annotations from the gnomAD v2.1.1 exome set
-#### [dbSNP](https://www.ncbi.nlm.nih.gov/snp) +### [dbSNP](https://www.ncbi.nlm.nih.gov/snp) dbSNP 155 annotations. Descriptions taken from UCSC's [reference on dbSNP155](https://genome.ucsc.edu/cgi-bin/hgTrackUi?db=hg38&g=dbSnp155Composite) @@ -857,14 +916,14 @@ dbSNP 155 annotations. Descriptions taken from UCSC's [reference on dbSNP155](ht
-#### [cadd](http://cadd.gs.washington.edu) +### [cadd](http://cadd.gs.washington.edu) A score >=0 that indicates deleteriousness of a variant. Variants with cadd > 15 are more likely to be deleterious. See http://cadd.gs.washington.edu.
-#### [caddIndel](http://cadd.gs.washington.edu) +### [caddIndel](http://cadd.gs.washington.edu) A score >=0 that indicates deleteriousness of a variant. Variants with cadd > 15 are more likely to be deleterious. See http://cadd.gs.washington.edu. @@ -881,7 +940,7 @@ caddIndel scores are only defined for indels and MNPs. For SNPs, use the `cadd`
-#### clinvarVcf +### clinvarVcf ClinVar annotations, sourced from the ClinVar VCF dataset @@ -921,7 +980,7 @@ ClinVar annotations, sourced from the ClinVar VCF dataset
-#### (hg38-only) [LoGoFunc]( +### (hg38-only) [LoGoFunc]( https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10688473/) A machine learning method for predicting pathogenic GOF, pathogenic LOF, and neutral genetic variants, trained on a broad range of gene-, protein-, and variant-level features describing diverse biological characteristics. @@ -938,7 +997,7 @@ A machine learning method for predicting pathogenic GOF, pathogenic LOF, and neu `logofunc.lof`: The LoGoFunc loss of function (LOF) score -#### (hg38-only) [GeneBass]() +### (hg38-only) [GeneBass]() `genebass.id`: The GeneBass VCF `ID` diff --git a/perl/INSTALL.md b/perl/INSTALL.md index 2e3fd3991..003d3d05e 100644 --- a/perl/INSTALL.md +++ b/perl/INSTALL.md @@ -250,7 +250,8 @@ Database configurations are stored in YAML files in the `config` directory. By d 1. [Human (hg38) database](https://s3.amazonaws.com/bystro-db/hg38_v11.tar.gz) 2. [Human (hg19) database](https://s3.amazonaws.com/bystro-db/hg19_v10.tar.gz) -3. There are no restrictions on species support, but we currently only build human genomes. Please create a GitHub issue if you would like us to support others +3. [Rat (rn7) database](https://s3.amazonaws.com/bystro-db/rn7.tar.gz) +4. There are no restrictions on species support, but for the open source Bystro Annotator we currently only build human and rat genomes, and do not guarantee that the open-source version will be up to date. Please create a GitHub issue if you would like us to support others or need updates to the current databases. ## Running your first annotation