-
Notifications
You must be signed in to change notification settings - Fork 3
/
headder.py
167 lines (143 loc) · 5.67 KB
/
headder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/env python
from modeller import *
from modeller.optimizers import molecular_dynamics, conjugate_gradients
from modeller.automodel import *
from modeller.scripts import complete_pdb
from Bio import PDB as pdb
import re
import csv
import os, glob
import shutil
"""
headder.py
header adder
Adds a new header to completed structure files, listing each missing residue together with its corresponding number in the sequence.
"""
#function to add lines to top of the file
def line_prepender(filename, line):
    """
    Insert `line` as the new first line of the file `filename`.

    Trailing carriage-return / newline characters are stripped from `line`
    and exactly one newline is put between it and the previous content.
    The file is rewritten in place and must already exist.
    """
    new_first_line = line.rstrip('\r\n') + '\n'
    with open(filename, 'r+') as handle:
        previous_content = handle.read()
        #rewind so the write starts at the top of the file
        handle.seek(0, 0)
        handle.write(new_first_line + previous_content)
#class to specify each element in CSV
class PDB_info(object):
    """
    Gives names to the first five elements of one row of the .csv file.

    Attributes, in column order:
      id           - id number of the pdb file
      protein      - protein name the pdb file is associated with
      complete     - yes, or the missing residues
      conformation - active or inactive
      mutation     - details of the mutation, if there is one
    """
    #attribute names, listed in the same order as the csv columns
    _COLUMNS = ('id', 'protein', 'complete', 'conformation', 'mutation')

    def __init__(self, row):
        #row[0] becomes .id, row[1] becomes .protein, ... row[4] becomes .mutation
        for column, attribute in enumerate(self._COLUMNS):
            setattr(self, attribute, row[column])
#reads in info from csv file
#the with-block closes temp.csv as soon as every row has been read
#(the file used to stay open for the rest of the run)
with open('./temp.csv', 'r') as datafile:
    datareader = csv.reader(datafile) #reads structures file
    data = list(datareader) #one element per row of the csv file
#parses csv data using PDB_info class
pdb_info = [PDB_info(item) for item in data]
#builds and prepends the new header for every structure listed in the csv
#row 0 of the csv is skipped (presumably the column titles -- confirm against temp.csv)
for entry in pdb_info[1:]:
    #assigns variable names to pdb_info elements
    pdb_name = entry.id #saves given pdb name as a variable
    protein_name = entry.protein #saves given protein name as a variable
    complete = entry.complete #saves yes or no for complete
    structure_conf = entry.conformation #saves active or inactive for conformation
    mutation = entry.mutation

    ppb = pdb.PPBuilder() #peptide class to get sequence
    #gives location of the pdb file
    pdb_file = './PDBs/' + pdb_name + '.pdb'
    parser = pdb.PDBParser()
    struct = parser.get_structure("name", pdb_file) #read in pdb file using PDBParser

    #gets name of the structure file
    if structure_conf == 'active':
        conf_dir, suffix = './actives/', '_active.pdb'
    else:
        conf_dir, suffix = './inactives/', '_inactive.pdb'
    if complete == 'yes':
        state_dir = 'complete/'
    else:
        state_dir = 'incomplete/'
    structure_file = conf_dir + state_dir + protein_name + suffix

    #reads the pdb file; the with-block closes the handle on every iteration
    with open(pdb_file) as fp:
        lines = fp.readlines()

    #collects the leading run of HEADER / REMARK 300 lines (at most 10),
    #stopping at the first line that is neither
    #slicing instead of indexing 0..9 keeps short files from raising IndexError
    good_lines = []
    for raw_line in lines[:10]:
        if raw_line.startswith(('HEADER', 'REMARK 300')):
            #get rid of new line character
            good_lines.append(raw_line.replace('\n', ''))
        else:
            break
    #reversed because each line is prepended on its own further down,
    #so the last one written ends up first in the file
    rev_good_lines = list(reversed(good_lines))
    print(rev_good_lines)

    #builds the peptides once and reuses them for both passes below
    peptides = ppb.build_peptides(struct)
    for seq in peptides:
        print(seq.get_sequence())

    #reads the residue numbers from the first line of the pdb file
    header = re.split(r'HEADER\W+', lines[0])[1] #uses a modified version of PDB file
    header_list = header.split(':')
    first_res = int(header_list[1])
    last_res = int(header_list[3]) #parsed but not used further below

    #get missing residue ranges
    structure_sequence = ''
    first_range = []
    last_range = []
    previous_end = None #no dashes are inserted before the first peptide
    for seq in peptides:
        #use this re to get the chain breaks from the text form of the peptide
        search = re.search('start=([0-9]{1,5}).+end=([0-9]{1,5})', "{0}".format(seq))
        seg_start = int(search.group(1))
        seg_end = int(search.group(2))
        first_range.append(seg_start)
        last_range.append(seg_end)
        if previous_end is not None:
            gap = seg_start - previous_end - 1
            if gap > 0: #put in dashes for missing residues
                structure_sequence += gap * '-'
        previous_end = seg_end
        structure_sequence += str(seq.get_sequence())
    print(structure_sequence)

    #a missing range runs from one past the end of a peptide
    #to one before the start of the next peptide
    first_missing = [seg_end + 1 for seg_end in last_range[:-1]]
    last_missing = [seg_start - 1 for seg_start in first_range[1:]]
    for gap_first, gap_last in zip(first_missing, last_missing):
        print('%d %d' % (gap_first, gap_last + 1))

    #read in the full sequence from the REMARK 300 lines of the pdb file
    #the line ending is stripped from every piece as it is added; the old
    #code threw away the result of rstrip, which left newline characters
    #inside the sequence and shifted every lookup after the first line
    full_sequence = ''
    for raw_line in lines[1:10]:
        split_line = re.split('REMARK 300 ', raw_line)
        if split_line[0] == '':
            full_sequence += split_line[1].rstrip('\r\n')
    print(full_sequence)

    #parse sequence into counter (res #, res)
    #NOTE(review): the number written is the offset from first_res, not the
    #residue number itself -- confirm this is what the header should show
    mis_res_list = []
    for gap_first, gap_last in zip(first_missing, last_missing):
        start = gap_first - first_res
        end = gap_last + 1 - first_res
        print(list(range(start, end)))
        for offset in range(start, end):
            mis_res_list.append(str(offset) + ': ' + full_sequence[offset])
    print(mis_res_list)
    joiner = ', '.join(mis_res_list)

    #concat the header and structure file
    #every call puts its line at the very top, so the calls run bottom-up:
    #the file ends up as header lines, 'Missing Residues:', the residues,
    #'Mutations:', the mutation, then the previous content
    line_prepender(structure_file, mutation)
    line_prepender(structure_file, 'Mutations:')
    line_prepender(structure_file, joiner)
    line_prepender(structure_file, 'Missing Residues:')
    for header_line in rev_good_lines:
        line_prepender(structure_file, header_line)