-
Notifications
You must be signed in to change notification settings - Fork 0
/
GenoMe.py
185 lines (157 loc) · 7.77 KB
/
GenoMe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
import pandas as pd
import requests
import json
import sys
import os
from bs4 import BeautifulSoup
from collections import OrderedDict
class Geno(object):
"""class to reads and interprets 23andMe raw genotype data."""
def __init__(self, raw_23andMe):
"""input path to raw 23andMe genotype data. data will be read as a form of pandas DataFrame.
>>> g = Geno("./23andMe_raw_data.txt")
>>> g.my_geno"""
# GET report as json from 23andMe API
res = requests.get("https://api.23andme.com/3/report/").content
self._report = json.loads(res)
self.my_geno = pd.read_csv(raw_23andMe, sep="\t", header=14,
names=["rsid", "chr", "pos", "genotype"],
low_memory=False)
def type_id(self, rsid, ref_population="JPT"):
"""search input rsid at SNPedia, report your possible trait related to this SNP.
>>> g.type_id("rs28897696")"""
mine = self.my_geno[self.my_geno.rsid == rsid]
if mine.empty:
return None
# your genotype. if you are a male(have "Y" chromosome),
# duplicate your X chromosome allele to make a two-lettered string
if mine.chr.values[0] in ["X", "Y"]:
if "Y" in self.my_geno.chr.unique():
mm = mine.genotype.values[0]
genotype = _sort_seq(mm + ";" + mm)
else:
mm, ff = mine.genotype.values[0]
genotype = _sort_seq(mm + ";" + ff)
# search this at snpedia
snpedia_report, g_percent = _search_rsid(rsid, genotype, ref_population)
snpedia_report = snpedia_report.strip()
print rsid, "|", genotype, "|", snpedia_report, "|", "{}% of {}".format(g_percent, ref_population)
return mine
def type_trait(self, query, ref_population="JPT"):
"""search input query at SNPedia, report all related SNPs and your genotype.
>>> g.type_trait("lung cancer")"""
rsid_list = _search_anything(query)
if not rsid_list[0].startswith("rs"):
print "try {} instead".format(rsid_list)
return
for rsid in rsid_list:
self.type_id(rsid, ref_population)
detail_url = ("https://www.snpedia.com/index.php/" + query).replace(" ", "%20")
print " * more details at", detail_url
def report_wellness(self, ref_population="JPT"):
"""return wellness report provided by 23andMe API sevice.
>>> g.report_wellness()"""
try:
height, width = map(int, os.popen('stty size', 'r').read().split())
except:
width = 100
print "="*(width//2-9), "WELLNESS REPORT", "="*(width//2-9), "\n"
for i in range(len(self._report["data"])):
with_detail = pd.DataFrame(self._report["data"][i]).dropna(subset=["details"])
if not "markers" in with_detail.index:
continue
for j in range(len(with_detail.ix["markers"].details)):
#print with_detail.report_id#with_detail.ix["markers"].details[j][u'biological_explanation']
rsid_report = with_detail.ix["markers"].details[j]["id"]
variants_report = with_detail.ix["markers"].details[j]["variants"]
for variant_dict in variants_report:
pos_report = variant_dict["end"]
pick = self.my_geno[self.my_geno.rsid == rsid_report]
if pick.empty:
continue
#print pick.pos.values[0], pos_report
#if pick.pos.values[0] == pos_report:
from_mom, from_dad = pick.genotype.values[0]
genotype = from_mom+";"+from_dad
snpedia_report, geno_percent = _search_rsid(rsid_report, genotype, ref_population)
if variant_dict["has_effect"] == True:
print "< {} >".format(with_detail.ix["markers"]["title"])
print "-" * width
print with_detail.ix["markers"].details[j]["biological_explanation"].strip()
print
print " rsID:", rsid_report
print " Yours:", genotype
if ("There is currently no text" not in snpedia_report) and (snpedia_report.strip() != ""):
print " Detail:", snpedia_report.strip(), "|", "{}% of {}".format(geno_percent, ref_population)
print "\n\n"
def _search_rsid(rsid, genotype, ref_population="JPT"):
"""search input rsid at SNPedia, return and compare SNPedia result and yours"""
genotype = _sort_seq(genotype)
snpedia_query = "https://www.snpedia.com/index.php/" + rsid
soup = BeautifulSoup(requests.get(snpedia_query).content, "lxml")
table = soup.find_all("table", attrs={"class": "sortable"})
stab_orien = soup.find_all("td")[3].text
# complement my genotype if stabilized orientation is 'minus'
if stab_orien == "minus":
genotype = _comp_seq(genotype)
if table != []:
snpedia_var_dict = OrderedDict()
for n, i in enumerate(table[0].find_all("td")):
if n % 3 == 0: k = i.text.strip().strip("(").strip(")")
if n % 3 == 1: m = i.text.strip()
if n % 3 == 2:
if len(k) == 2: continue
snpedia_var_dict[k] = (m, i.text.strip())
# save snpedia_report (result of input genotype)
try:
snpedia_report = " | ".join(snpedia_var_dict[genotype])
except KeyError:
snpedia_report = "No info about {}".format(genotype)
# save geno_percent (which percent of people in JPT population have input genotype)
try:
geno_order = snpedia_var_dict.keys().index(genotype)
#print snpedia_var_dict.keys(), genotype, rsid
for td in soup.find_all("td"):
if td.img is not None:
_chart = td.img["src"]
break
_, vals, __, ___, pops, ____, _____, ______ = _chart.split("&")
val1, val2, val3 = vals[6:].split("|")
pops = pops[11:-2].split("|")
try:
jpt_idx = len(pops)-pops.index(ref_population)-1
geno_ratio = [map(float, val1.split(","))[jpt_idx],
map(float, val2.split(","))[jpt_idx],
map(float, val3.split(","))[jpt_idx]]
#print geno_ratio, geno_order
geno_percent = geno_ratio[geno_order]
except ValueError:
geno_percent = "--"
except ValueError:
geno_percent = "--"
except UnboundLocalError:
geno_percent = "--"
else:
snpedia_report = ""
geno_percent = "--"
return snpedia_report, geno_percent
def _search_anything(query):
"""search input words at SNPedia, return and compare related genotypes and yours"""
snpedia_query = "https://www.snpedia.com/index.php/" + query
soup = BeautifulSoup(requests.get(snpedia_query).content, "lxml")
links = soup.find_all("div", attrs={"class": "mw-content-ltr"})[0].find_all("a")
rsid_list = [l.text for l in links if l.text.startswith("rs")]
rsid_list = [txt for txt in rsid_list if not txt.endswith(")")]
if rsid_list == []:
rsid_list = [l.text for l in links if l.text]
if 'search for this page title' in rsid_list:
rsid_list = ["another phenotype"]
return rsid_list
def _comp_seq(genotype):
comp_dict = {"A":"T", "T":"A", "G":"C", "C":"G", ";":";"}
return _sort_seq("".join([comp_dict[i] for i in genotype]))
def _sort_seq(genotype):
order_dict = {"A":0, "C":1, "G":2, "T":3, ";":";"}
m, _, f = [order_dict[i] for i in genotype]
if m > f: return ";".join[f, m]
else: return genotype