-
Notifications
You must be signed in to change notification settings - Fork 0
/
AncesterFinal.py
168 lines (134 loc) · 6.38 KB
/
AncesterFinal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import pandas as pd
import numpy as np
import gzip
import sys
'''This outputs two files, one is each species' frequency breakdown of the alt, ref, and missing for each position.
The second file is the the most frequent nucleotide in the vcfs of the ancesters (Pongo, Pan, Gorilla)'''
missing_stat = float(sys.argv[1]) #system input for missingness
vcf = sys.argv[2]
names = sys.argv[3]
vcf = gzip.open(vcf,'r')
idfile = open(names,'r')
def makeIndDictionary(line,file):
indDict = {}
specDict = {}
df = pd.read_csv(file, sep = '\t', header = None)
lineSplit = line.split('\t')
for i in range(9, len(lineSplit)):
idName = lineSplit[i]
row = df[df[0] == idName]
if len(row)>0:
species = list(row[1])[0]
indDict[i] = species
if species not in specDict:
specDict[species] = [i]
else:
specDict[species].append(i)
return indDict, specDict
for line in vcf: #getting the ids add putting in a list
line = line.decode('ASCII')
if line[:3] == '#CH':
no,Go = makeIndDictionary(line,idfile)
break
def makeTXT(vcf,ids,missing):
cols = ['Chromosome','Position','Ref','Alt', 'Alt2', 'Alt3']
for species in ids.keys():
for x in ['ref', 'alt', 'alt2', 'alt3', 'missing']:
cols.append(species + "_" + x)
output_df_dic = {}
#above is a dataframe just to easily organize the data
state_df_dic = {} #Ensembl will be added later
for line in vcf:
line = line.decode('ASCII')
if line[0] != '#':
add = [] #lest to add to freq file
state_add = [] #list to add to state file
lst = line.split() #list of values in line
chrome = lst[0] #chrom in line
pos = lst[1] #position in line
nuke_ref = lst[3] #ref in line
nuke_alt = lst[4] #alt in line
ancest_missing, ancest_tot, ancest_alt, ancest_ref = 0, 0, 0, 0 #initiate for acestors
an_alt2, an_alt3 = 0, 0
state_add.extend([chrome,pos])
## figure out how many alt alleles there are
triallelic = False
quadallelic = False
if len(nuke_alt.split(',')) == 2:
triallelic = True
altBP1, altBP2 = nuke_alt.split(',')
elif len(nuke_alt.split(',')) == 3:
quadallelic = True
altBP1, altBP2, altBP3 = nuke_alt.split(',')
if triallelic:
add.extend([chrome,pos,nuke_ref,altBP1, altBP2, np.nan]) # add values to list for output files
elif quadallelic:
add.extend([chrome,pos,nuke_ref,altBP1, altBP2, altBP3])
else:
add.extend([chrome,pos,nuke_ref, nuke_alt, np.nan, np.nan])
for species in ids: #go species by species
alt = 0
ref = 0 #reset all variables after each species
tot = 0
missing = 0
alt2, alt3 = 0, 0
use = ids[species] #get list of ids
## iterates through each species
for index in use: #go id by id for each species
data = lst[index][:3] #data for eact id
if len(data) < 2:
missing += 2
else:
ref += data.count('0')
alt += data.count('1')
missing += data.count('.')
if triallelic:
alt2 += data.count('2') #data for all species
if quadallelic:
alt2 += data.count('2')
alt3 += data.count('3')
tot += 2
if species in ['Pan','Pongo','Gorilla']: #puts in data depending on species (just ancestors)
if len(data) < 2:
ancest_missing += 2
else:
ancest_ref += data.count('0')
ancest_alt += data.count('1')
ancest_missing += data.count('.')
if triallelic:
an_alt2 += data.count('2')
if quadallelic:
an_alt2 += data.count('2')
an_alt3 += data.count('3')
ancest_tot += 2
if len(use) != tot/2: #check 1
print("Ids don't match len of list")
exit()
if round(alt/tot+ref/tot+missing/tot+alt2/tot+alt3/tot,3) != 1: #check 2
print(chrome, pos, alt/tot+ref/tot+missing/tot+alt2/tot+alt3/tot)
print('Frequencies does not equal 1')
exit()
add.extend([ref/tot, alt/tot, alt2/tot, alt3/tot, missing/tot]) #add data to list
ancest_missing_freq = ancest_missing/ancest_tot #get freq of missing in pan,pongo,gorilla
freqs = [ancest_ref/ancest_tot, ancest_alt/ancest_tot, an_alt2/ancest_tot, an_alt3/ancest_tot, ancest_missing_freq]
maxFreq = max(freqs)
maxIndex = freqs.index(maxFreq)
if ancest_missing_freq >= missing_stat or (maxIndex == 4):
ancest_state = 'NA' #when the ancestor nucleotide will be missing
elif maxIndex == 0:
ancest_state = nuke_ref #when ancestor nucleotide will be the ref
elif maxIndex == 1:
if triallelic or quadallelic:
ancest_state = altBP1
else:
ancest_state = nuke_alt #when the ancestor nucleotide will be alt
elif maxIndex == 2:
ancest_state = altBP2
elif maxIndex == 3:
ancest_state = altBP3
state_add.append(ancest_state)
output_df_dic[len(output_df_dic)] = add #add list to dataframe
state_df_dic[len(state_df_dic)] = state_add #add list to dataframe for ancestor state
pd.DataFrame.from_dict(output_df_dic, orient='index', columns=cols).to_csv('Frequencies.csv',index=False) #convert dataframe to a text file
pd.DataFrame.from_dict(state_df_dic,orient='index',columns=['Chromosome','Position','Nucleotide']).to_csv('State.csv',index=False) #convert data frame to text file
makeTXT(vcf,Go,missing_stat)