
Commit

Merge pull request #176 from biothings/dbnsfp-4.4a
dbNSFP 4.4a update
Yao Yao authored Nov 3, 2023
2 parents 39954d9 + 29364f9 commit eaad66f
Showing 9 changed files with 4,070 additions and 1,135 deletions.
1,116 changes: 0 additions & 1,116 deletions src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py

This file was deleted.

1,070 changes: 1,070 additions & 0 deletions src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v1.py

Large diffs are not rendered by default.

1,149 changes: 1,149 additions & 0 deletions src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v2.py

Large diffs are not rendered by default.

@@ -12,8 +12,7 @@


"""
this parser is for dbNSFP v4.3a downloaded from
https://sites.google.com/site/jpopgen/dbNSFP
Deprecated. This parser is for dbNSFP v4.3a downloaded from https://sites.google.com/site/jpopgen/dbNSFP
"""


@@ -644,7 +643,6 @@ def data_generator(input_file, version):
for row in file_reader:
row = dict(zip(header, row))

# use transposed matrix to have 1 line with N 187 columns
current_row = DbnsfpReader.map_row_to_json(row, version=version)
if previous_row and current_row:
if current_row["_id"] == previous_row["_id"]:
809 changes: 809 additions & 0 deletions src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v1.py

Large diffs are not rendered by default.

894 changes: 894 additions & 0 deletions src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v2.py

Large diffs are not rendered by default.

71 changes: 55 additions & 16 deletions src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py
@@ -1,8 +1,11 @@
import os
import glob

from .dbnsfp_mapping import mapping
from .dbnsfp_parser import load_data_file as load_common
from .dbnsfp_mapping_44a_v1 import mapping as mapping_v1
from .dbnsfp_parser_44a_v1 import load_file as load_file_v1
from .dbnsfp_mapping_44a_v2 import mapping as mapping_v2
from .dbnsfp_parser_44a_v2 import load_file as load_file_v2

import biothings.hub.dataload.uploader as uploader
from hub.dataload.uploader import SnpeffPostUpdateUploader
from hub.dataload.storage import MyVariantIgnoreDuplicatedStorage
@@ -15,37 +18,73 @@
}


class DBNSFPBaseUploader(uploader.ParallelizedSourceUploader,
SnpeffPostUpdateUploader):
class DBNSFPBaseUploaderV1(uploader.ParallelizedSourceUploader, SnpeffPostUpdateUploader):

storage_class = MyVariantIgnoreDuplicatedStorage
GLOB_PATTERN = "dbNSFP*_variant.chr*"

@classmethod
def get_mapping(klass):
return mapping
def get_mapping(cls):
return mapping_v1

def jobs(self):
# tuple(input_file,version), where version is either hg38 or hg19)
return map(lambda e: (e, self.__class__.__metadata__["assembly"]),
glob.glob(os.path.join(self.data_folder, self.__class__.GLOB_PATTERN)))
paths = glob.glob(os.path.join(self.data_folder, self.__class__.GLOB_PATTERN))
assembly = self.__class__.__metadata__["assembly"]
return map(lambda path: (path, assembly), paths)

def load_data(self, path, assembly):
self.logger.debug("loading file " + path)
return load_file_v1(path, version=assembly)


class DBNSFPBaseUploaderV2(uploader.ParallelizedSourceUploader, SnpeffPostUpdateUploader):

storage_class = MyVariantIgnoreDuplicatedStorage
GLOB_PATTERN = "dbNSFP*_variant.chr*"

@classmethod
def get_mapping(cls):
return mapping_v2

def jobs(self):
paths = glob.glob(os.path.join(self.data_folder, self.__class__.GLOB_PATTERN))
assembly = self.__class__.__metadata__["assembly"]
return map(lambda path: (path, assembly), paths)

def load_data(self, path, assembly):
self.logger.debug("loading file " + path)
return load_file_v2(path, version=assembly)


def load_data(self, input_file, hg):
self.logger.debug("loading file " + input_file)
return load_common(input_file, version=hg)
class DBNSFPHG38UploaderV1(DBNSFPBaseUploaderV1):
name = "dbnsfp_hg38_v1"
main_source = "dbnsfp"
__metadata__ = {
"assembly": "hg38",
"src_meta": SRC_META
}


class DBNSFPHG19UploaderV1(DBNSFPBaseUploaderV1):
name = "dbnsfp_hg19_v1"
main_source = "dbnsfp"
__metadata__ = {
"assembly": "hg19",
"src_meta": SRC_META
}


class DBNSFPHG38Uploader(DBNSFPBaseUploader):
name = "dbnsfp_hg38"
class DBNSFPHG38UploaderV2(DBNSFPBaseUploaderV2):
name = "dbnsfp_hg38_v2"
main_source = "dbnsfp"
__metadata__ = {
"assembly": "hg38",
"src_meta": SRC_META
}


class DBNSFPHG19Uploader(DBNSFPBaseUploader):
name = "dbnsfp_hg19"
class DBNSFPHG19UploaderV2(DBNSFPBaseUploaderV2):
name = "dbnsfp_hg19_v2"
main_source = "dbnsfp"
__metadata__ = {
"assembly": "hg19",
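
For orientation, a minimal standalone sketch of what the jobs() methods above produce; the data folder and file names are hypothetical examples matching the "dbNSFP*_variant.chr*" glob, not actual hub contents.

import glob
import os

# Hypothetical values; in the hub, data_folder comes from self.data_folder
# and assembly from self.__class__.__metadata__["assembly"].
data_folder = "/data/dbnsfp/20231103"
assembly = "hg38"

paths = glob.glob(os.path.join(data_folder, "dbNSFP*_variant.chr*"))
jobs = [(path, assembly) for path in paths]
# Each (path, assembly) tuple is expected to become one parallelized upload job,
# whose load_data() call runs load_file_v1(path, version=assembly) (or the v2
# counterpart) on that single per-chromosome file.
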
43 changes: 43 additions & 0 deletions src/utils/dotfield.py
@@ -0,0 +1,43 @@
import orjson
from biothings.utils.dotfield import merge_object


def make_object(attr, value):
    """
    Create a dictionary following the input dot notation and the value.
    Example::
        make_object('a.b.c', 100) --> {a:{b:{c:100}}}, or
        make_object(['a','b','c'], 100) --> {a:{b:{c:100}}}
    This is an orjson implementation of biothings.utils.dotfield.make_object, for better performance.
    TODO Merge into biothings.utils.dotfield if necessary. (And delete this function then.)
    """
    # Accept both the dotted-string and list forms promised by the docstring
    attr_list = attr.split(".") if isinstance(attr, str) else list(attr)
    s = ""
    for k in attr_list:
        s += '{"' + k + '":'
    s += orjson.dumps(value).decode("utf-8")  # decoding is necessary because orjson dumps into bytes
    s += "}" * len(attr_list)
    return orjson.loads(s)


def parse_dot_fields(genedoc):
    """
    parse_dot_fields({'a': 1, 'b.c': 2, 'b.a.c': 3})
    should return
    {'a': 1, 'b': {'a': {'c': 3}, 'c': 2}}
    This is a copy of biothings.utils.dotfield.parse_dot_fields; however, it uses the orjson-based make_object() above.
    TODO If the orjson make_object() is merged into biothings.utils.dotfield, this function can be deleted.
    """
    dot_fields = []
    expanded_doc = {}
    for key in genedoc:
        if key.find(".") != -1:
            dot_fields.append(key)
            expanded_doc = merge_object(expanded_doc, make_object(key, genedoc[key]))
    genedoc.update(expanded_doc)
    for key in dot_fields:
        del genedoc[key]
    return genedoc
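
As a quick illustration of how these helpers behave, here is an assumed usage sketch based on the docstrings above; the import path and the dbNSFP-style field names are examples, not taken from this commit.

from utils.dotfield import make_object, parse_dot_fields  # assumed import path (src/ on sys.path)

# Expand a single dotted key into nested dicts.
doc = make_object("sift.converted_rankscore", 0.91)
# doc == {"sift": {"converted_rankscore": 0.91}}

# Expand every dotted key in a flat document; non-dotted keys are left untouched.
flat = {"_id": "chr1:g.69511A>G", "sift.score": 0.028, "sift.pred": "D"}
nested = parse_dot_fields(flat)
# nested == {"_id": "chr1:g.69511A>G", "sift": {"score": 0.028, "pred": "D"}}
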
49 changes: 49 additions & 0 deletions src/utils/table.py
@@ -0,0 +1,49 @@
from dataclasses import dataclass
from itertools import groupby
from typing import Callable


@dataclass
class TableColumn:
    """
    Configuration marker for each column in a tabular file.
    A TableColumn object indicates that a value from the named column must be transformed before its assignment to a destination field inside a JSON doc.
    E.g. TableColumn(name="AF", dest="allele_freq", transform=float) means that a value from the "AF" column must be cast to float and then assigned to the
    "allele_freq" field inside its associated JSON doc.
    """
    name: str  # column name
    dest: str = None  # destination field name
    transform: Callable = None  # transforming function applied to the column values
    tag: str = None  # tags columns that need a special prior sanity check or post-processing

    @classmethod
    def identity_function(cls, value):
        return value

    def __post_init__(self):
        if self.dest is None:
            # This is a very common way of determining the field name.
            # E.g. a value in column "SIFT_score" is often wrapped into field "sift.score" (dotfield)
            self.dest = self.name.lower().replace("_", ".")

        # Default to the identity function so we don't have to check elsewhere whether self.transform is None.
        # This choice is made because most columns have a transforming function in our application.
        if self.transform is None:
            self.transform = self.identity_function


def create_tag_column_map(columns: list[TableColumn]):
    """
    Map each tag to its associated column or columns.
    Args:
        columns: a list of TableColumn objects
    Returns:
        a dictionary of { <tag> : <list-of-columns> }
    """
    tagged_columns = sorted([c for c in columns if c.tag is not None], key=lambda c: c.tag)
    result = {tag: list(cols) for tag, cols in groupby(tagged_columns, lambda c: c.tag)}
    return result
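
A short, assumed usage sketch of TableColumn and create_tag_column_map; the column names are dbNSFP-style examples chosen for illustration, and the import path is an assumption.

from utils.table import TableColumn, create_tag_column_map  # assumed import path (src/ on sys.path)

columns = [
    TableColumn(name="SIFT_score", transform=float),  # dest defaults to "sift.score"
    TableColumn(name="SIFT_pred"),  # dest defaults to "sift.pred", identity transform
    TableColumn(name="hg19_pos(1-based)", dest="hg19.pos", transform=int, tag="position"),
]

tag_map = create_tag_column_map(columns)
# tag_map == {"position": [TableColumn(name="hg19_pos(1-based)", dest="hg19.pos", ...)]}
# Columns without a tag do not appear in the map.
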
