From 0864aefb26347bc746c9b0e2f7e774634527c0aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20R=2E=20H=C3=B6lzlwimmer?= Date: Wed, 24 Jun 2020 02:56:16 +0200 Subject: [PATCH 1/3] add VariantEffect dataloader.yaml for Framepool --- Framepool/VariantEffect/dataloader.yaml | 82 +++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 Framepool/VariantEffect/dataloader.yaml diff --git a/Framepool/VariantEffect/dataloader.yaml b/Framepool/VariantEffect/dataloader.yaml new file mode 100644 index 000000000..e06ff72ea --- /dev/null +++ b/Framepool/VariantEffect/dataloader.yaml @@ -0,0 +1,82 @@ +defined_as: kipoiseq.dataloaders.SingleVariantUTRDataLoader + +type: SampleIterator + +args: + gtf_file: + doc: file path; Genome annotation GTF file + example: + url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.gtf?raw=true + md5: 8a1f158e17379773fcab21628fc3910f + name: gtf_file.gtf + fasta_file: + doc: Reference Genome sequence in fasta format + example: + url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22.fa.gz?raw=true + md5: 5ebe034256ecc5689989a96387c5a65e + name: fasta_file.fa.gz + vcf_file: + doc: Genomic variants to evaluate in VCF format + example: + url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.vcf.gz?raw=true + md5: c45e75fb75326c2be514d2dcea52e585 + name: vcf_file.vcf.gz + vcf_file_tbi: + doc: tabix index of vcf (just to make kipoi tests work - leave as None in normal usage) + example: + url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.vcf.gz.tbi?raw=true + md5: 9aebc88287a3d6b8517ace9e0fc427af + name: vcf_file.vcf.gz.tbi + feature_type: + doc: > + Needs to be "5UTR" + default: 5UTR + example: 5UTR + type: str + infer_from_cds: + doc: infer UTR regions from coding sequence + optional: True 
+ default: False + type: bool + on_error_warn: + doc: print warning instead of throwing an error on malformed input + optional: True + default: True + type: bool +default_args: + feature_type: 5UTR + +output_schema: + inputs: + ref_seq: + name: ref_seq + shape: () + special_type: DNAStringSeq + doc: reference sequence of UTR + associated_metadata: ranges + alt_seq: + name: alt_seq + doc: alternative sequence of 5' UTR + shape: () + special_type: DNAStringSeq + associated_metadata: ranges, variants + metadata: + transcript_id: + type: str + doc: transcript id + variant: + chrom: + type: str + doc: chromsome of variant + pos: + type: int + doc: variant position + ref: + type: str + doc: variant reference + alt: + type: str + doc: variant alternative string + str: + type: str + doc: string representation of the variant \ No newline at end of file From 7f89e401964974c1015f5cba3228b003b0de923e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20R=2E=20H=C3=B6lzlwimmer?= Date: Wed, 24 Jun 2020 03:05:56 +0200 Subject: [PATCH 2/3] add id field in framepool veff dataloader.yaml --- Framepool/VariantEffect/dataloader.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Framepool/VariantEffect/dataloader.yaml b/Framepool/VariantEffect/dataloader.yaml index e06ff72ea..20191d2df 100644 --- a/Framepool/VariantEffect/dataloader.yaml +++ b/Framepool/VariantEffect/dataloader.yaml @@ -77,6 +77,9 @@ output_schema: alt: type: str doc: variant alternative string + id: + type: str + doc: variant id str: type: str doc: string representation of the variant \ No newline at end of file From 5d00d779b26f1b6152665ab2eb25ba25fac9d786 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20R=2E=20H=C3=B6lzlwimmer?= Date: Fri, 19 Feb 2021 17:11:28 +0100 Subject: [PATCH 3/3] add variant effect dataloader for framepool --- Framepool/VariantEffect/dataloader.py | 14 +++ Framepool/VariantEffect/dataloader.yaml | 155 ++++++++++++------------ 2 files changed, 89 insertions(+), 80 deletions(-) create 
mode 100644 Framepool/VariantEffect/dataloader.py diff --git a/Framepool/VariantEffect/dataloader.py b/Framepool/VariantEffect/dataloader.py new file mode 100644 index 000000000..526c1bc46 --- /dev/null +++ b/Framepool/VariantEffect/dataloader.py @@ -0,0 +1,14 @@ +from kipoiseq.dataloaders import SingleVariantUTRDataLoader + +class SingleVariantFramepoolDataloader(SingleVariantUTRDataLoader): + def __init__(self, gtf_file, fasta_file, vcf_file, vcf_file_tbi=None, infer_from_cds=False, on_error_warn=True, **kwargs): + kwargs["feature_type"] = "5UTR" + super().__init__( + gtf_file=gtf_file, + fasta_file=fasta_file, + vcf_file=vcf_file, + vcf_file_tbi=vcf_file_tbi, + infer_from_cds=infer_from_cds, + on_error_warn=on_error_warn, + **kwargs + ) diff --git a/Framepool/VariantEffect/dataloader.yaml b/Framepool/VariantEffect/dataloader.yaml index 20191d2df..cb1ff6006 100644 --- a/Framepool/VariantEffect/dataloader.yaml +++ b/Framepool/VariantEffect/dataloader.yaml @@ -1,85 +1,80 @@ -defined_as: kipoiseq.dataloaders.SingleVariantUTRDataLoader +defined_as: dataloader.py::SingleVariantFramepoolDataloader type: SampleIterator - args: - gtf_file: - doc: file path; Genome annotation GTF file - example: - url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.gtf?raw=true - md5: 8a1f158e17379773fcab21628fc3910f - name: gtf_file.gtf - fasta_file: - doc: Reference Genome sequence in fasta format - example: - url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22.fa.gz?raw=true - md5: 5ebe034256ecc5689989a96387c5a65e - name: fasta_file.fa.gz - vcf_file: - doc: Genomic variants to evaluate in VCF format - example: - url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.vcf.gz?raw=true - md5: c45e75fb75326c2be514d2dcea52e585 - name: vcf_file.vcf.gz - vcf_file_tbi: - doc: tabix index of vcf (just to make kipoi 
tests work - leave as None in normal usage) - example: - url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.vcf.gz.tbi?raw=true - md5: 9aebc88287a3d6b8517ace9e0fc427af - name: vcf_file.vcf.gz.tbi - feature_type: - doc: > - Needs to be "5UTR" - default: 5UTR - example: 5UTR - type: str - infer_from_cds: - doc: infer UTR regions from coding sequence - optional: True - default: False - type: bool - on_error_warn: - doc: print warning instead of throwing an error on malformed input - optional: True - default: True - type: bool -default_args: - feature_type: 5UTR + gtf_file: + doc: file path; Genome annotation GTF file + example: + url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.gtf?raw=true + md5: 8a1f158e17379773fcab21628fc3910f + name: gtf_file.gtf + fasta_file: + doc: Reference Genome sequence in fasta format + example: + url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22.fa.gz?raw=true + md5: 5ebe034256ecc5689989a96387c5a65e + name: fasta_file.fa.gz + vcf_file: + doc: Genomic variants to evaluate in VCF format + example: + url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.vcf.gz?raw=true + md5: c45e75fb75326c2be514d2dcea52e585 + name: vcf_file.vcf.gz + vcf_file_tbi: + doc: tabix index of vcf (just to make kipoi tests work - leave as None in normal usage) + optional: True + example: + url: https://github.com/kipoi/kipoiseq/blob/ddeb4eefc15ebf8a9b88fca4ce99d9b315d54f34/tests/data/chr22_ENST00000319363.vcf.gz.tbi?raw=true + md5: 9aebc88287a3d6b8517ace9e0fc427af + name: vcf_file.vcf.gz.tbi + infer_from_cds: + doc: infer UTR regions from coding sequence + optional: True + default: False + example: False + type: bool + on_error_warn: + doc: print warning instead of throwing an error on malformed input + optional: True + 
default: True + example: True + type: bool output_schema: - inputs: - ref_seq: - name: ref_seq - shape: () - special_type: DNAStringSeq - doc: reference sequence of UTR - associated_metadata: ranges - alt_seq: - name: alt_seq - doc: alternative sequence of 5' UTR - shape: () - special_type: DNAStringSeq - associated_metadata: ranges, variants - metadata: - transcript_id: - type: str - doc: transcript id - variant: - chrom: - type: str - doc: chromsome of variant - pos: - type: int - doc: variant position - ref: - type: str - doc: variant reference - alt: - type: str - doc: variant alternative string - id: - type: str - doc: variant id - str: - type: str - doc: string representation of the variant \ No newline at end of file + inputs: + ref_seq: + name: ref_seq + shape: () + special_type: DNAStringSeq + doc: reference sequence of 5' UTR + associated_metadata: ranges + alt_seq: + name: alt_seq + doc: alternative sequence of 5' UTR + shape: () + special_type: DNAStringSeq + associated_metadata: ranges, variants + metadata: + transcript_id: + type: str + doc: transcript id + variant: + chrom: + type: str + doc: chromosome of variant + pos: + type: int + doc: variant position + ref: + type: str + doc: variant reference + alt: + type: str + doc: variant alternative string + id: + type: str + doc: variant id + str: + type: str + doc: string representation of the variant +