diff --git a/src/dataload/sources/biomuta/__init__.py b/src/dataload/sources/biomuta/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/dataload/sources/biomuta/biomuta_parser.py b/src/dataload/sources/biomuta/biomuta_parser.py
new file mode 100644
index 00000000..234170e3
--- /dev/null
+++ b/src/dataload/sources/biomuta/biomuta_parser.py
@@ -0,0 +1,135 @@
+import csv
+from biothings.utils.dataload import unlist
+from biothings.utils.dataload import value_convert_to_number
+from biothings.utils.dataload import merge_duplicate_rows
+from utils.hgvs import get_hgvs_from_vcf
+from itertools import groupby
+
+VALID_COLUMN_NO = 17
+
+'''this parser is for BioMuta v3.0 (BioMuta3 Complete Dataset) downloaded from
+https://hive.biochemistry.gwu.edu/cgi-bin/prd/biomuta/servlet.cgi'''
+
+
+# convert one snp to json
+def _map_line_to_json(df):
+    # specific variable treatment
+    genomic_position = clean_data(df["genomic_position"], ("-",))
+    if not genomic_position:
+        return
+    genomic_position_split = genomic_position.replace("chr", "").replace("-", ":").split(":")
+    chrom = genomic_position_split[0]
+    chromStart = genomic_position_split[1]
+    if chrom == 'M':
+        chrom = 'MT'
+
+    ref = df["ref_nuc"]
+    alt = df["var_nuc"]
+
+    HGVS = get_hgvs_from_vcf(chrom, int(chromStart), ref, alt, mutant_type=False)
+
+    index = df["index"]
+    uniprotkb_swiss_prot_id = clean_data(df["uniprotkb_swiss_prot_id"], ("-",))
+    gene_name = clean_data(df["gene_name"], ("-",))
+    refseq_nucleotide_id = clean_data(df["refseq_nucleotide_id"], ("-",))
+    position_nuc = clean_data(df["position_nuc"], ("-",))
+    position_aa = clean_data(df["position_aa"], ("-",))
+    ref_aa = clean_data(df["ref_aa"], ("-",))
+    var_aa = clean_data(df["var_aa"], ("-",))
+    polyphen = clean_data(df["polyphen"], ("-",))
+    pmid = clean_data(df["pmid"], ("-",))
+
+    cancer_type = clean_data(df["cancer_type"], ("-",))
+    if cancer_type:
+        cancer_type_split = cancer_type.replace(" / ", ":").split(":")
+        assert len(cancer_type_split) == 3, "cancer_type split error : {} : {}".format(HGVS, cancer_type)
+        _d, doid, term = cancer_type_split
+        assert _d == "DOID", "cancer_type split error : {} : {}".format(HGVS, cancer_type)
+    else:
+        doid = None
+        term = None
+
+    source = clean_data(df["source"], ("-",))
+    vfunction = clean_data(df["function"], ("-",))
+    if vfunction:
+        vfunction = vfunction.split("|")
+    status = clean_data(df["status"], ("-",))
+
+    # load as json data
+    one_snp_json = {
+        "_id": HGVS,
+        "biomuta": {
+            'index': index,
+            'uniprotkb_swiss_prot_id': uniprotkb_swiss_prot_id,
+            'gene_name': gene_name,
+            'refseq_nucleotide_id': refseq_nucleotide_id,
+            'genomic_position': genomic_position,
+            'position_nuc': position_nuc,
+            'ref_nuc': ref,
+            'var_nuc': alt,
+            'position_aa': position_aa,
+            'ref_aa': ref_aa,
+            'var_aa': var_aa,
+            'polyphen': polyphen,
+            'pmid': pmid,
+            'cancer_type': {
+                "DOID": doid,
+                "term": term},
+            'source': source,
+            'function': vfunction,
+            'status': status,
+        }
+    }
+    one_snp_json = value_convert_to_number(one_snp_json)
+    return one_snp_json
+
+
+def clean_index(s):
+    return s.lower().replace("/", "_").replace("-", "_").replace("(", "_").replace(")", "").replace("#", "")
+
+
+def clean_data(d, vals):
+    if d in vals:
+        return None
+    else:
+        return d
+
+
+# open file, parse, pass to json mapper
+def load_data(input_file, version='hg19'):
+    open_file = open(input_file)
+    db_biomuta = csv.reader(open_file)
+    index = next(db_biomuta)
+    assert len(index) == VALID_COLUMN_NO, "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, len(index))
+    index = [clean_index(s) for s in index]
+    biomuta = (dict(zip(index, row)) for row in db_biomuta)
+    biomuta = filter(lambda row: row["index"] != "", biomuta)
+    json_rows = map(_map_line_to_json, biomuta)
+    json_rows = (row for row in json_rows if row)
+    json_rows = sorted(json_rows, key=lambda row: row["_id"])
+    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
+    json_rows = (merge_duplicate_rows(rg, "biomuta") for rg in row_groups)
+    return (unlist(dict_sweep(row, vals=[None, ])) for row in json_rows)
+
+
+def dict_sweep(d, vals=[".", "-", "", "NA", "none", " ", "Not Available", "unknown"]):
+    """
+    @param d: a dictionary
+    @param vals: a string or list of strings to sweep
+    """
+    for key, val in list(d.items()):
+        if val in vals:
+            del d[key]
+        elif isinstance(val, list):
+            val = [v for v in val if v not in vals]
+            for item in val:
+                if isinstance(item, dict):
+                    dict_sweep(item, vals)
+            if len(val) == 0:
+                del d[key]
+            else:
+                d[key] = val
+        elif isinstance(val, dict):
+            dict_sweep(val, vals)
+            if len(val) == 0:
+                del d[key]
+    return d
\ No newline at end of file
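For quick sanity checks during development, a driver along these lines could be used; it is a minimal sketch, assuming src/ is on PYTHONPATH, and the input path and document count are placeholders. Each emitted document is keyed by the HGVS id, with the remaining columns nested under "biomuta" as built in _map_line_to_json above.

    # Hypothetical smoke test for the BioMuta parser; "biomuta.csv" is a placeholder path.
    from itertools import islice
    from dataload.sources.biomuta.biomuta_parser import load_data

    for doc in islice(load_data("biomuta.csv"), 3):
        print(doc["_id"], doc["biomuta"].get("gene_name"))
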
diff --git a/src/dataload/sources/denovodb/__init__.py b/src/dataload/sources/denovodb/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/dataload/sources/denovodb/denovodb_parser.py b/src/dataload/sources/denovodb/denovodb_parser.py
new file mode 100644
index 00000000..1a8709ce
--- /dev/null
+++ b/src/dataload/sources/denovodb/denovodb_parser.py
@@ -0,0 +1,162 @@
+import csv
+from biothings.utils.dataload import unlist
+from biothings.utils.dataload import value_convert_to_number
+from biothings.utils.dataload import merge_duplicate_rows
+from utils.hgvs import get_hgvs_from_vcf
+from itertools import groupby
+from math import nan
+
+VALID_COLUMN_NO = 31
+
+'''this parser is for denovo-db v1.5 downloaded from
+http://denovo-db.gs.washington.edu/denovo-db/Download.jsp'''
+
+
+# convert one snp to json
+def _map_line_to_json(df):
+    # specific variable treatment
+    chrom = df["Chr"]
+    if chrom == 'M':
+        chrom = 'MT'
+
+    position = int(df["Position"])
+    ref, alt = df["Variant"].upper().split(">")
+
+    HGVS = get_hgvs_from_vcf(chrom, position, ref, alt, mutant_type=False)
+
+    sampleid = df["SampleID"]
+    studyname = df["StudyName"]
+    pubmedid = df["PubmedID"]
+    numprobands = df["NumProbands"]
+    numcontrols = df["NumControls"]
+    sequencetype = df["SequenceType"]
+    primaryphenotype = df["PrimaryPhenotype"]
+    validation = df["Validation"]
+    chrom = df["Chr"]
+    position = df["Position"]
+    variant = df["Variant"]
+    rsid = clean_rsid(df["rsID"], ("0", ))
+    dbsnpbuild = clean_data(df["DbsnpBuild"], ("0", ))
+    ancestralallele = df["AncestralAllele"]
+    kgenomecount = df["1000GenomeCount"]
+    exacfreq = df["ExacFreq"]
+    espaafreq = df["EspAaFreq"]
+    espeafreq = df["EspEaFreq"]
+    transcript = clean_data(df["Transcript"], ("none", ""))
+    codingdnasize = clean_data(df["codingDnaSize"], ("-1", ))
+    gene = clean_data(df["Gene"], ("NA", ""))
+    functionclass = clean_data(df["FunctionClass"], ("none", ""))
+    cdnavariant = clean_data(df["cDnaVariant"], ("NA", ""))
+    proteinvariant = clean_data(df["ProteinVariant"], ("NA", ""))
+    exon_intron = clean_data(df["Exon_Intron"], ("NA",))
+    polyphen_hdiv = clean_data(df["PolyPhen_HDiv"], ("-1",))
+    polyphen_hvar = clean_data(df["PolyPhen_HVar"], ("-1",))
+    siftscore = clean_data(df["SiftScore"], ("-1",))
+    caddscore = clean_data(df["CaddScore"], ("-1",))
+    lofscore = clean_data(df["LofScore"], ("-1",))
+    lrtscore = clean_data(df["LrtScore"], ("-1",))
+
+    # load as json data
+    one_snp_json = {
+        "_id": HGVS,
+        "denovodb": {
+            "ref": ref,
+            "alt": alt,
+            "sampleid": sampleid,
+            "studyname": studyname,
+            "pubmedid": pubmedid,
+            "numprobands": numprobands,
+            "numcontrols": numcontrols,
+            "sequencetype": sequencetype,
+            "primaryphenotype": primaryphenotype,
+            "validation": validation,
+            "position": position,
+            "variant": variant,
+            "rsid": rsid,
+            "dbsnpbuild": dbsnpbuild,
+            "ancestralallele": ancestralallele,
+            "1000genomecount": kgenomecount,
+            "exacfreq": exacfreq,
+            "espaafreq": espaafreq,
+            "espeafreq": espeafreq,
+            "transcript": transcript,
+            "codingdnasize": codingdnasize,
+            "gene": gene,
+            "functionclass": functionclass,
+            "cdnavariant": cdnavariant,
+            "proteinvariant": proteinvariant,
+            "exon_intron": exon_intron,
+            "polyphen_hdiv": polyphen_hdiv,
+            "polyphen_hvar": polyphen_hvar,
+            "siftscore": siftscore,
+            "caddscore": caddscore,
+            "lofscore": lofscore,
+            "lrtscore": lrtscore,
+        }
+    }
+    # one_snp_json = dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=[np.nan])
+
+    one_snp_json = value_convert_to_number(one_snp_json)
+    return one_snp_json
+
+
+def clean_index(s):
+    return s.replace("/", "_").replace("(", "_").replace(")", "").replace("#", "")
+
+
+def clean_data(d, vals):
+    if d in vals:
+        return nan
+    else:
+        return d
+
+
+def clean_rsid(d, vals):
+    if d in vals:
+        return nan
+    else:
+        return "rs{}".format(d)
+
+
+# open file, parse, pass to json mapper
+def load_data(input_file, version='hg19'):
+    open_file = open(input_file)
+    db_denovodb = csv.reader(open_file, delimiter="\t")
+    index = next(db_denovodb)
+    while index[0].startswith("##"):
+        index = next(db_denovodb)
+    assert len(index) == VALID_COLUMN_NO, "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, len(index))
+    index = [clean_index(s) for s in index]
+    denovodb = (dict(zip(index, row)) for row in db_denovodb)
+    denovodb = filter(lambda row: row["Chr"] != "", denovodb)
+    json_rows = map(_map_line_to_json, denovodb)
+    json_rows = (row for row in json_rows if row)
+    json_rows = sorted(json_rows, key=lambda row: row["_id"])
+    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
+    json_rows = (merge_duplicate_rows(rg, "denovodb") for rg in row_groups)
+    return (unlist(dict_sweep(row, vals=[nan, ])) for row in json_rows)
+    # return (merge_duplicate_rows(rg, "denovodb") for rg in row_groups)
+
+
+def dict_sweep(d, vals=[".", "-", "", "NA", "none", " ", "Not Available", "unknown"]):
+    """
+    @param d: a dictionary
+    @param vals: a string or list of strings to sweep
+    """
+    for key, val in list(d.items()):
+        if val in vals:
+            del d[key]
+        elif isinstance(val, list):
+            val = [v for v in val if v not in vals]
+            for item in val:
+                if isinstance(item, dict):
+                    dict_sweep(item, vals)
+            if len(val) == 0:
+                del d[key]
+            else:
+                d[key] = val
+        elif isinstance(val, dict):
+            dict_sweep(val, vals)
+            if len(val) == 0:
+                del d[key]
+    return d
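The missing-value handling above leans on nan sentinels returned by clean_data and clean_rsid, which dict_sweep later strips out of the document. A small illustration with made-up values (not real denovo-db records), assuming the module import path used earlier:

    # Illustrative only: placeholder values run through the helpers defined above.
    from math import nan
    from dataload.sources.denovodb.denovodb_parser import clean_data, clean_rsid, dict_sweep

    record = {
        "rsid": clean_rsid("0", ("0",)),           # missing rsID -> nan
        "gene": clean_data("NA", ("NA", "")),      # missing gene symbol -> nan
        "caddscore": clean_data("23.1", ("-1",)),  # real value passes through unchanged
    }
    print(dict_sweep(record, vals=[nan]))          # -> {'caddscore': '23.1'}
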
("-1",)) + +# load as json data + one_snp_json = { + "_id": HGVS, + "denovodb": { + "ref": ref, + "alt": alt, + "sampleid": sampleid, + "studyname": studyname, + "pubmedid": pubmedid, + "numprobands": numprobands, + "numcontrols": numcontrols, + "sequencetype": sequencetype, + "primaryphenotype": primaryphenotype, + "validation": validation, + "position": position, + "variant": variant, + "rsid": rsid, + "dbsnpbuild": dbsnpbuild, + "ancestralallele": ancestralallele, + "1000genomecount": kgenomecount, + "exacfreq": exacfreq, + "espaafreq": espaafreq, + "espeafreq": espeafreq, + "transcript": transcript, + "codingdnasize": codingdnasize, + "gene": gene, + "functionclass": functionclass, + "cdnavariant": cdnavariant, + "proteinvariant": proteinvariant, + "exon_intron": exon_intron, + "polyphen_hdiv": polyphen_hdiv, + "polyphen_hvar": polyphen_hvar, + "siftscore": siftscore, + "caddscore": caddscore, + "lofscore": lofscore, + "lrtscore": lrtscore, + } + } + # one_snp_json = dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=[np.nan]) + + one_snp_json = value_convert_to_number(one_snp_json) + return one_snp_json + + +def clean_index(s): + return s.replace("/", "_").replace("(", "_").replace(")", "").replace("#", "") + + +def clean_data(d, vals): + if d in vals: + return nan + else: + return d + + +def clean_rsid(d, vals): + if d in vals: + return nan + else: + return "rs{}".format(d) + + +# open file, parse, pass to json mapper +def load_data(input_file, version='hg19'): + open_file = open(input_file) + db_denovodb = csv.reader(open_file, delimiter="\t") + index = next(db_denovodb) + while index[0].startswith("##"): + index = next(db_denovodb) + assert len(index) == VALID_COLUMN_NO, "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, len(index)) + index = [clean_index(s) for s in index] + denovodb = (dict(zip(index, row)) for row in db_denovodb) + denovodb = filter(lambda row: row["Chr"] != "", denovodb) + json_rows = map(_map_line_to_json, denovodb) + json_rows = (row for row in json_rows if row) + json_rows = sorted(json_rows, key=lambda row: row["_id"]) + row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"])) + json_rows = (merge_duplicate_rows(rg, "denovodb") for rg in row_groups) + return (unlist(dict_sweep(row, vals=[nan, ])) for row in json_rows) + # return (merge_duplicate_rows(rg, "denovodb") for rg in row_groups) + + +def dict_sweep(d, vals=[".", "-", "", "NA", "none", " ", "Not Available", "unknown"]): + """ + @param d: a dictionary + @param vals: a string or list of strings to sweep + """ + for key, val in list(d.items()): + if val in vals: + del d[key] + elif isinstance(val, list): + val = [v for v in val if v not in vals] + for item in val: + if isinstance(item, dict): + dict_sweep(item, vals) + if len(val) == 0: + del d[key] + else: + d[key] = val + elif isinstance(val, dict): + dict_sweep(val, vals) + if len(val) == 0: + del d[key] + return d diff --git a/src/dataload/sources/kaviar/__init__.py b/src/dataload/sources/kaviar/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/dataload/sources/kaviar/kaviar_parser.py b/src/dataload/sources/kaviar/kaviar_parser.py new file mode 100644 index 00000000..8d362d78 --- /dev/null +++ b/src/dataload/sources/kaviar/kaviar_parser.py @@ -0,0 +1,134 @@ +import vcf +from biothings.utils.dataload import unlist +from biothings.utils.dataload import value_convert_to_number, merge_duplicate_rows, dict_sweep +from utils.hgvs import get_hgvs_from_vcf +from itertools import groupby, chain 
+import os, csv
+
+VALID_COLUMN_NO = 8
+
+'''this parser is for Kaviar version 160204-Public (All variants, annotated with data sources) downloaded from
+http://db.systemsbiology.net/kaviar/Kaviar.downloads.html'''
+
+
+# convert one snp to json
+def _map_line_to_json(item):
+    chrom = item.CHROM
+    chromStart = item.POS
+    ref = item.REF
+    info = item.INFO
+
+    try:
+        af = info['AF']
+    except KeyError:
+        af = None
+    try:
+        ac = info['AC']
+    except KeyError:
+        ac = None
+    try:
+        an = info['AN']
+    except KeyError:
+        an = None
+    try:
+        ds = info['DS']
+    except KeyError:
+        ds = None
+
+    # convert vcf object to string
+    item.ALT = [str(alt) for alt in item.ALT]
+
+    # if multiallelic, put all variants as a list in multi-allelic field
+    hgvs_list = None
+    if len(item.ALT) > 1:
+        hgvs_list = []
+        for alt in item.ALT:
+            try:
+                hgvs_list.append(get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=False))
+            except Exception:
+                hgvs_list.append(alt)
+
+    assert len(item.ALT) == len(info['AC']), "Expecting length of item.ALT == length of info.AC, but not for %s" % (item)
+    assert len(item.ALT) == len(info['AF']), "Expecting length of item.ALT == length of info.AF, but not for %s" % (item)
+    if ds:
+        if len(item.ALT) != len(info['DS']):
+            ds_str = ",".join(info['DS'])
+            ds_str = ds_str.replace("NA7022,18", "NA7022_18")
+            ds_list = ds_str.split(",")
+            info['DS'] = [d.replace("NA7022_18", "NA7022,18") for d in ds_list]
+            assert len(item.ALT) == len(info['DS']), "info.DS mismatch for %s\n## DS: %s" % (item, info['DS'])
+
+    for i, alt in enumerate(item.ALT):
+        try:
+            (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=True)
+        except Exception:
+            continue
+
+        if HGVS is None:
+            return
+
+        # load as json data
+        one_snp_json = {
+            "_id": HGVS,
+            "kaviar": {
+                "multi-allelic": hgvs_list,
+                "ref": ref,
+                "alt": alt,
+                "af": info['AF'][i],
+                "ac": info['AC'][i],
+                "an": an,
+                "ds": info['DS'][i].split("|") if ds else None,
+            }
+        }
+
+        yield value_convert_to_number(one_snp_json)
+
+
+# open file, parse, pass to json mapper
+def load_data(input_file):
+    vcf_reader = vcf.Reader(open(input_file, 'r'), strict_whitespace=True)
+    json_rows = map(_map_line_to_json, vcf_reader)
+    json_rows = chain.from_iterable(json_rows)
+
+    if not os.path.exists("alldata.csv"):
+        print("Writing data")
+        with open("alldata.csv", "w") as f:
+            dbwriter = csv.writer(f)
+            for doc in json_rows:
+                dbwriter.writerow([doc['_id'], str(doc)])
+    if not os.path.exists("sorted.csv"):
+        print("Start Sorting")
+        import subprocess
+        p = subprocess.Popen('sort alldata.csv > sorted.csv', shell=True)
+        os.waitpid(p.pid, 0)
+        print("Sorted")
+
+    json_rows = csv.reader(open('sorted.csv'))
+    json_rows = (eval(row[1]) for row in json_rows)
+    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
+    json_rows = (merge_duplicate_rows(rg, "kaviar") for rg in row_groups)
+    return (unlist(dict_sweep(row, vals=[None, ])) for row in json_rows)
+
+
+def dict_sweep(d, vals=[".", "-", "", "NA", "none", " ", "Not Available", "unknown"]):
+    """
+    @param d: a dictionary
+    @param vals: a string or list of strings to sweep
+    """
+    for key, val in list(d.items()):
+        if val in vals:
+            del d[key]
+        elif isinstance(val, list):
+            val = [v for v in val if v not in vals]
+            for item in val:
+                if isinstance(item, dict):
+                    dict_sweep(item, vals)
+            if len(val) == 0:
+                del d[key]
+            else:
+                d[key] = val
+        elif isinstance(val, dict):
+            dict_sweep(val, vals)
+            if len(val) == 0:
+                del d[key]
+    return d
\ No newline at end of file
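A rough end-to-end sketch for this parser follows; the VCF path is a placeholder. Note that load_data as written materializes intermediate alldata.csv and sorted.csv files in the current working directory and shells out to sort, so it assumes a Unix-like environment and enough free disk for the full Kaviar dump.

    # Hypothetical driver; "Kaviar-160204-Public.vcf" is a placeholder path.
    from itertools import islice
    from dataload.sources.kaviar.kaviar_parser import load_data

    # Multi-allelic sites yield one document per ALT allele, each keeping the full
    # HGVS list under "multi-allelic"; rows sharing an "_id" are merged downstream.
    for doc in islice(load_data("Kaviar-160204-Public.vcf"), 3):
        print(doc["_id"], doc["kaviar"])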