diff --git a/DeepMEL/dataloader.py b/DeepMEL/DeepMEL/dataloader.py similarity index 99% rename from DeepMEL/dataloader.py rename to DeepMEL/DeepMEL/dataloader.py index b6dba6a6b..15304fc82 100644 --- a/DeepMEL/dataloader.py +++ b/DeepMEL/DeepMEL/dataloader.py @@ -10,7 +10,6 @@ class MyDataset(Dataset): """Example re-implementation of kipoiseq.dataloaders.SeqIntervalDl - Args: intervals_file: bed3 file containing intervals fasta_file: file path; Genome sequence @@ -49,4 +48,4 @@ def __getitem__(self, idx): return { "inputs": [seq_onehot, seq_onehot_rc], "metadata": [ranges, ranges_rc] - } + } \ No newline at end of file diff --git a/DeepMEL/dataloader.yaml b/DeepMEL/DeepMEL/dataloader.yaml similarity index 97% rename from DeepMEL/dataloader.yaml rename to DeepMEL/DeepMEL/dataloader.yaml index 55a85c254..665348f70 100644 --- a/DeepMEL/dataloader.yaml +++ b/DeepMEL/DeepMEL/dataloader.yaml @@ -44,4 +44,4 @@ output_schema: # Define the dataloader output schema according to the returned v doc: Ranges describing inputs.seq - name: ranges_rc type: GenomicRanges - doc: Ranges describing inputs.seq_rc + doc: Ranges describing inputs.seq_rc \ No newline at end of file diff --git a/DeepMEL/model.yaml b/DeepMEL/DeepMEL/model.yaml similarity index 86% rename from DeepMEL/model.yaml rename to DeepMEL/DeepMEL/model.yaml index 89aabe1df..8aa5e7d8d 100644 --- a/DeepMEL/model.yaml +++ b/DeepMEL/DeepMEL/model.yaml @@ -13,11 +13,11 @@ info: authors: - name: Ibrahim Ihsan Taskiran github: itaskiran - email: ibrahimihsan.taskiran@kuleuven.vib.be + email: ibrahimihsan.taskiran@kuleuven.be - name: Liesbeth Minnoye - name: Stein Aerts doc: Model predicting melanoma-specific accessible regions - cite_as: https://doi.org/10.1101/2019.12.21.885715 + cite_as: https://doi.org/10.1101/gr.260844.120 trained_on: Accessible genomic sites. Held-out chromosome chr2. license: MIT @@ -39,4 +39,4 @@ schema: targets: name: topic shape: (24,) - doc: Topic Prediction (4-MEL, 7-MES) + doc: Topic Prediction (4-MEL, 7-MES) \ No newline at end of file diff --git a/DeepMEL/DeepMEL2/dataloader.py b/DeepMEL/DeepMEL2/dataloader.py new file mode 100644 index 000000000..15304fc82 --- /dev/null +++ b/DeepMEL/DeepMEL2/dataloader.py @@ -0,0 +1,51 @@ +from __future__ import absolute_import, division, print_function +import numpy as np +from kipoi.data import Dataset +from kipoi.metadata import GenomicRanges +from kipoiseq.dataloaders.sequence import BedDataset +from kipoiseq.extractors import FastaStringExtractor +from kipoiseq.transforms import OneHot +from kipoiseq.transforms.functional import resize_interval + + +class MyDataset(Dataset): + """Example re-implementation of kipoiseq.dataloaders.SeqIntervalDl + Args: + intervals_file: bed3 file containing intervals + fasta_file: file path; Genome sequence + """ + + def __init__(self, intervals_file, fasta_file, ignore_targets=True): + self.bt = BedDataset(intervals_file, + bed_columns=3, + ignore_targets=ignore_targets) + self.fasta_file = fasta_file + self.fasta_extractor = None + self.transform = OneHot() # one-hot encode DNA sequence + + def __len__(self): + return len(self.bt) + + def __getitem__(self, idx): + self.fasta_extractor = FastaStringExtractor(self.fasta_file) + + # get the intervals + interval, targets = self.bt[idx] + + # resize to 500bp + interval = resize_interval(interval, 500, anchor='center') + + # extract the sequence + seq = self.fasta_extractor.extract(interval) + + # one-hot encode the sequence + seq_onehot = self.transform(seq) + seq_onehot_rc = seq_onehot[::-1, ::-1] + + ranges = GenomicRanges.from_interval(interval) + ranges_rc = GenomicRanges.from_interval(interval) + + return { + "inputs": [seq_onehot, seq_onehot_rc], + "metadata": [ranges, ranges_rc] + } \ No newline at end of file diff --git a/DeepMEL/DeepMEL2/dataloader.yaml b/DeepMEL/DeepMEL2/dataloader.yaml new file mode 100644 index 000000000..665348f70 --- /dev/null +++ b/DeepMEL/DeepMEL2/dataloader.yaml @@ -0,0 +1,47 @@ +defined_as: dataloader.MyDataset # MyDataset impolemented in dataloader.py +args: # MyDataset.__init__ argument description + intervals_file: + doc: intervals file bed3 + example: + url: https://zenodo.org/record/3592452/files/input.bed?download=1 + md5: md5:58aa210c8ec9574df3f414aee10c7099 + fasta_file: + doc: Reference genome FASTA file path. + example: + url: https://zenodo.org/record/3592452/files/chr6_393135_399635.fa?download=1 + md5: md5:a6e5449b06e17f38420b2a4421199371 + ignore_targets: + doc: if True, don't return any target variables + optional: True # if not present, the "targets" will not be present + +info: + authors: + - name: Ibrahim Ihsan Taskiran + github: itaskiran + email: ibrahimihsan.taskiran@kuleuven.vib.be + doc: Data-loader returning one-hot encoded sequences given genome intervals + +dependencies: + conda: + - python=3.6 + - bioconda::pybedtools + - bioconda::pysam + - bioconda::pyfaidx + - numpy + - pandas + pip: + - kipoiseq + +output_schema: # Define the dataloader output schema according to the returned values + inputs: + - shape: (500, 4) + doc: One-hot encoded DNA sequence + - shape: (500, 4) + doc: One-hot encoded reverse-complemented DNA sequence + metadata: + - name: ranges + type: GenomicRanges + doc: Ranges describing inputs.seq + - name: ranges_rc + type: GenomicRanges + doc: Ranges describing inputs.seq_rc \ No newline at end of file diff --git a/DeepMEL/DeepMEL2/model.yaml b/DeepMEL/DeepMEL2/model.yaml new file mode 100644 index 000000000..1b0e382db --- /dev/null +++ b/DeepMEL/DeepMEL2/model.yaml @@ -0,0 +1,42 @@ +defined_as: kipoi.model.KerasModel +args: + arch: + url: https://zenodo.org/record/4590308/files/DeepMEL2.json.txt?download=1 + md5: md5:38faa21aa88a2198e6e0a2a910694a3d + weights: + url: https://zenodo.org/record/4590308/files/DeepMEL2.hdf5?download=1 + md5: md5:bf95fa12678c7f284905eddf69b4fbc9 + +default_dataloader: . + +info: + authors: + - name: Ibrahim Ihsan Taskiran + github: itaskiran + email: ibrahimihsan.taskiran@kuleuven.be + - name: Zeynep Kalender Atak + - name: Stein Aerts + doc: Specialized deep learning model on melanoma chromatin accessibility data + cite_as: https://doi.org/10.1101/2019.12.21.885806 + trained_on: Accessible genomic sites. + license: MIT + +dependencies: + conda: # install via conda + - python=3.6 + - h5py==2.10.0 + + pip: # install via pip + - keras>=2.2.4 + - tensorflow>=1.14.0 + +schema: + inputs: + - shape: (500, 4) + doc: DNA sequence + - shape: (500, 4) + doc: Reverse-complemented DNA sequence + targets: + name: topic + shape: (47,) + doc: Topic Prediction \ No newline at end of file diff --git a/DeepMEL/DeepMEL2_GABPA/dataloader.py b/DeepMEL/DeepMEL2_GABPA/dataloader.py new file mode 100644 index 000000000..15304fc82 --- /dev/null +++ b/DeepMEL/DeepMEL2_GABPA/dataloader.py @@ -0,0 +1,51 @@ +from __future__ import absolute_import, division, print_function +import numpy as np +from kipoi.data import Dataset +from kipoi.metadata import GenomicRanges +from kipoiseq.dataloaders.sequence import BedDataset +from kipoiseq.extractors import FastaStringExtractor +from kipoiseq.transforms import OneHot +from kipoiseq.transforms.functional import resize_interval + + +class MyDataset(Dataset): + """Example re-implementation of kipoiseq.dataloaders.SeqIntervalDl + Args: + intervals_file: bed3 file containing intervals + fasta_file: file path; Genome sequence + """ + + def __init__(self, intervals_file, fasta_file, ignore_targets=True): + self.bt = BedDataset(intervals_file, + bed_columns=3, + ignore_targets=ignore_targets) + self.fasta_file = fasta_file + self.fasta_extractor = None + self.transform = OneHot() # one-hot encode DNA sequence + + def __len__(self): + return len(self.bt) + + def __getitem__(self, idx): + self.fasta_extractor = FastaStringExtractor(self.fasta_file) + + # get the intervals + interval, targets = self.bt[idx] + + # resize to 500bp + interval = resize_interval(interval, 500, anchor='center') + + # extract the sequence + seq = self.fasta_extractor.extract(interval) + + # one-hot encode the sequence + seq_onehot = self.transform(seq) + seq_onehot_rc = seq_onehot[::-1, ::-1] + + ranges = GenomicRanges.from_interval(interval) + ranges_rc = GenomicRanges.from_interval(interval) + + return { + "inputs": [seq_onehot, seq_onehot_rc], + "metadata": [ranges, ranges_rc] + } \ No newline at end of file diff --git a/DeepMEL/DeepMEL2_GABPA/dataloader.yaml b/DeepMEL/DeepMEL2_GABPA/dataloader.yaml new file mode 100644 index 000000000..665348f70 --- /dev/null +++ b/DeepMEL/DeepMEL2_GABPA/dataloader.yaml @@ -0,0 +1,47 @@ +defined_as: dataloader.MyDataset # MyDataset impolemented in dataloader.py +args: # MyDataset.__init__ argument description + intervals_file: + doc: intervals file bed3 + example: + url: https://zenodo.org/record/3592452/files/input.bed?download=1 + md5: md5:58aa210c8ec9574df3f414aee10c7099 + fasta_file: + doc: Reference genome FASTA file path. + example: + url: https://zenodo.org/record/3592452/files/chr6_393135_399635.fa?download=1 + md5: md5:a6e5449b06e17f38420b2a4421199371 + ignore_targets: + doc: if True, don't return any target variables + optional: True # if not present, the "targets" will not be present + +info: + authors: + - name: Ibrahim Ihsan Taskiran + github: itaskiran + email: ibrahimihsan.taskiran@kuleuven.vib.be + doc: Data-loader returning one-hot encoded sequences given genome intervals + +dependencies: + conda: + - python=3.6 + - bioconda::pybedtools + - bioconda::pysam + - bioconda::pyfaidx + - numpy + - pandas + pip: + - kipoiseq + +output_schema: # Define the dataloader output schema according to the returned values + inputs: + - shape: (500, 4) + doc: One-hot encoded DNA sequence + - shape: (500, 4) + doc: One-hot encoded reverse-complemented DNA sequence + metadata: + - name: ranges + type: GenomicRanges + doc: Ranges describing inputs.seq + - name: ranges_rc + type: GenomicRanges + doc: Ranges describing inputs.seq_rc \ No newline at end of file diff --git a/DeepMEL/DeepMEL2_GABPA/model.yaml b/DeepMEL/DeepMEL2_GABPA/model.yaml new file mode 100644 index 000000000..f5863ee43 --- /dev/null +++ b/DeepMEL/DeepMEL2_GABPA/model.yaml @@ -0,0 +1,42 @@ +defined_as: kipoi.model.KerasModel +args: + arch: + url: https://zenodo.org/record/4590405/files/DeepMEL2_GABPA.json.txt?download=1 + md5: md5:5ee8b63b856f45bbbd412b62bf42404d + weights: + url: https://zenodo.org/record/4590405/files/DeepMEL2_GABPA.hdf5?download=1 + md5: md5:327a7fe35f1a65b07ac5e9c00de67a9f + +default_dataloader: . + +info: + authors: + - name: Ibrahim Ihsan Taskiran + github: itaskiran + email: ibrahimihsan.taskiran@kuleuven.be + - name: Zeynep Kalender Atak + - name: Stein Aerts + doc: Augmented DeepMEL2 model with GABPA ChIP-seq data + cite_as: https://doi.org/10.1101/2019.12.21.885806 + trained_on: Accessible genomic sites. + license: MIT + +dependencies: + conda: # install via conda + - python=3.6 + - h5py==2.10.0 + + pip: # install via pip + - keras>=2.2.4 + - tensorflow>=1.14.0 + +schema: + inputs: + - shape: (500, 4) + doc: DNA sequence + - shape: (500, 4) + doc: Reverse-complemented DNA sequence + targets: + name: topic + shape: (48,) + doc: Topic Prediction \ No newline at end of file