-
Notifications
You must be signed in to change notification settings - Fork 32
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #176 from biothings/dbnsfp-4.4a
dbNSFP 4.4a update
- Loading branch information
Showing
9 changed files
with
4,070 additions
and
1,135 deletions.
There are no files selected for viewing
1,116 changes: 0 additions & 1,116 deletions
1,116
src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py
This file was deleted.
Oops, something went wrong.
1,070 changes: 1,070 additions & 0 deletions
1,070
src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v1.py
Large diffs are not rendered by default.
Oops, something went wrong.
1,149 changes: 1,149 additions & 0 deletions
1,149
src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v2.py
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
809 changes: 809 additions & 0 deletions
809
src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v1.py
Large diffs are not rendered by default.
Oops, something went wrong.
894 changes: 894 additions & 0 deletions
894
src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v2.py
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import orjson | ||
from biothings.utils.dotfield import merge_object | ||
|
||
|
||
def make_object(attr, value):
    """
    Create a nested dictionary following the input dot notation and the value.

    Example::

        make_object('a.b.c', 100) --> {a:{b:{c:100}}}, or
        make_object(['a','b','c'], 100) --> {a:{b:{c:100}}}

    This is an orjson implementation of biothings.utils.dotfield.make_object, for better performance.
    TODO Merge into biothings.utils.dotfield if necessary. (And delete this function then.)

    Args:
        attr: either a dot-separated string of key names, or a sequence of key names
            (sequence items are converted with str() because JSON object keys must be strings)
        value: the value to place at the innermost level; it must be serializable by orjson
    Returns:
        the nested dictionary decoded by orjson.loads
    """
    # Honor both input forms promised in the docstring; only strings are split on dots.
    if isinstance(attr, str):
        keys = attr.split(".")
    else:
        keys = [str(key) for key in attr]

    # orjson dumps into bytes and loads from bytes, so the whole document is assembled as bytes.
    # Each key is serialized by orjson (not spliced in verbatim) so that quotes and backslashes
    # inside a key are escaped properly.
    opening = b"".join(b"{" + orjson.dumps(key) + b":" for key in keys)
    closing = b"}" * len(keys)
    return orjson.loads(opening + orjson.dumps(value) + closing)
|
||
|
||
def parse_dot_fields(genedoc):
    """
    Expand every dot-notation key of a document into nested dictionaries, in place.

    Example::

        parse_dot_fields({'a': 1, 'b.c': 2, 'b.a.c': 3})

    should return

        {'a': 1, 'b': {'a': {'c': 3}, 'c': 2}}

    This is a copy of biothings.utils.dotfield.parse_dot_fields. However here it uses the orjson make_object() function.
    TODO If orjson make_object() function is merged to biothings.utils.dotfield, this function can be deleted.

    Args:
        genedoc: the document (dictionary) to expand; it is modified and also returned
    Returns:
        the same genedoc object, with dotted keys replaced by their nested form
    """
    dotted_keys = []
    nested = {}
    for key, value in genedoc.items():
        if key.find(".") < 0:
            # Plain keys stay in the document untouched.
            continue
        dotted_keys.append(key)
        nested = merge_object(nested, make_object(key, value))

    # Put the nested structure in first, then drop the flat dotted keys it was built from.
    genedoc.update(nested)
    for key in dotted_keys:
        del genedoc[key]
    return genedoc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
from dataclasses import dataclass | ||
from itertools import groupby | ||
from typing import Callable | ||
|
||
|
||
@dataclass
class TableColumn:
    """
    Describe how one column of a tabular file is mapped into a JSON doc.

    A TableColumn object states that a value read from the named column is passed through a transforming function
    and then assigned to a destination field inside a JSON doc.

    E.g. TableColumn(name="AF", dest="allele_freq", transform=float) means that a value from the "AF" column
    must be cast to float and then be assigned to the "allele_freq" field inside its associated JSON doc.
    """

    name: str  # column name
    dest: str = None  # destination field name
    transform: Callable = None  # transforming function applied to the column values
    tag: str = None  # tagging columns that need special prior sanity check or post-processing

    @classmethod
    def identity_function(cls, value):
        """Return the value unchanged; this is the default transforming function."""
        return value

    def __post_init__(self):
        # Fall back to the identity function so that users of this object never need to check
        # whether self.transform is None. Most columns in our application do have a transforming function.
        if self.transform is None:
            self.transform = self.identity_function

        # Derive the destination from the column name when none is given. This is a very common practice,
        # e.g. a value in column "SIFT_score" is often wrapped to field "sift.score" (dotfield).
        if self.dest is None:
            self.dest = ".".join(self.name.lower().split("_"))
|
||
|
||
def create_tag_column_map(columns: list[TableColumn]):
    """
    Group the tagged columns by their tag.

    Columns whose tag is None are left out of the result.

    Args:
        columns: a list of TableColumn objects
    Returns:
        a dictionary of { <tag> : <list-of-columns> }
    """

    def tag_of(column):
        return column.tag

    # groupby() only merges adjacent items, so the tagged columns are sorted by tag first.
    tagged = [column for column in columns if column.tag is not None]
    tagged.sort(key=tag_of)

    tag_map = {}
    for tag, group in groupby(tagged, key=tag_of):
        tag_map[tag] = list(group)
    return tag_map