-
Notifications
You must be signed in to change notification settings - Fork 7
/
parse_acr_aca_with_cdd.py
executable file
·156 lines (129 loc) · 6.78 KB
/
parse_acr_aca_with_cdd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/python3
'''
*****************************************************************************************************
Purpose: Uses candidate Acr/Aca loci and CDD's that infer pathogenicity to attempt to filter false positive Acr/Aca locus.
Author:
Javi Gomez - https://github.com/rtomyj wrote the version 1.0.
Haidong Yi - https://github.com/haidyi revised the codes, remove some bugs, wrote the version 2.0.
Project:
This work is advised by Dr. Yanbin Yin at UNL - [email protected]
*****************************************************************************************************
'''
from collections import defaultdict
from sys import path as sys_path
sys_path.append('dependencies/PyGornism/')
from organism import Organism
'''
Purpose:
Uses the candidate Acr/Aca loci as a starting point to obtain neighboring proteins.
Goes up stream from first Acr/Aca protein in locus with a value of PROTEIN_UP_DOWN.
Goes down stream from last Acr/Aca protei in locus with a value of PROTEIN_UP_DOWN.
Creates a dict containing locus number as a value and locus proteins of neighbors.
Creates an faa string of the locus proteins of neighbors.
Arguments:
candidateAcrs - list of lists containing proteins in the inner list. Each protein list is an Acr/Aca locus.
ORGANISM_SUBJECT - Organism object that contains information of the organism being parsed.
PROTEIN_UP_DOWN - Number of proteins get upstream and downstream.
Returns:
neighborsFaaStr - faa representation of the newly acquired neighbors
NEIGHBORHOOD_NUM_maps_NEIGHBORHOOD_WP - dict containing newly acquired neighbors (Protein objects) maped by the locus number
'''
def get_acr_neighbors(candidateAcrs, ORGANISM_SUBJECT, PROTEIN_UP_DOWN):
neighborsFaaStr = ""
NEIGHBORHOOD_NUM_maps_NEIGHBORHOOD_WP = defaultdict(list)
'''
Goes through the candidate Acr/Aca loci.
Finds the neighboring proteins surrounding the locus.
Obtains the FAA string representation of each neighbors.
'''
from formated_output import get_faa
for neighborhoodNum, locus in enumerate(candidateAcrs):
startProtein, endProtein = locus[0], locus[len(locus) - 1] # first and last protein of locus
# downstream = ORGANISM_SUBJECT.get_downstream_neighbors(PROTEIN_UP_DOWN, startProtein, inclusive=False) # gets downstream neighbors
# upstream = ORGANISM_SUBJECT.get_upstream_neighbors(PROTEIN_UP_DOWN, endProtein) # gets upstream neighbors
downstream = ORGANISM_SUBJECT.get_downstream_neighbors(PROTEIN_UP_DOWN, endProtein, inclusive=False)
upstream = ORGANISM_SUBJECT.get_upstream_neighbors(PROTEIN_UP_DOWN, startProtein, inclusive=False)
'''
Combines downstream, upstream and the remaining Acr/Aca proteins (neglecting start/end)
'''
# neighborhood = downstream[:]
# neighborhood.extend(locus[1: len(locus) -1])
# neighborhood.extend(upstream[:])
neighborhood = upstream[:]
neighborhood.extend(locus[:])
neighborhood.extend(downstream[:])
for protein in neighborhood: # adds neighbor list to neighbor dict
NEIGHBORHOOD_NUM_maps_NEIGHBORHOOD_WP[neighborhoodNum].append(protein.id)
neighborsFaaStr += '# Neighborhood for locus {0} starting with {1} and ending with {2}\n'.format(neighborhoodNum, startProtein, endProtein)
neighborsFaaStr += get_faa(neighborhood) # continues building faa string
neighborsFaaStr += '\n\n'
return neighborsFaaStr, NEIGHBORHOOD_NUM_maps_NEIGHBORHOOD_WP
'''
Purpose:
Uses rpsblast+ using CDD's to filter out candidate Acr/Aca loci
Arguments:
candidateAcrs - list of lists containing proteins in the inner list. Each protein list is an Acr/Aca locus
ORGANISM_SUBJECT - Organism object that contains information of the organism being parsed
NEIGHBORHOOD_FAA_PATH - Path to file containing FAA's of neighborhoods
CDD_RESULTS_PATH - Where to store psiblast results
CDD_DB_PATH - Path to CDD's being used
PROTEIN_UP_DOWN - Number of proteins get upstream and downstream
MIN_NUM_PROTEINS_MATCH_CDD - Minimum number of proteins that had CDD hits needed to keep a locus
Returns:
result from parse_cdd_results()
'''
def use_cdd(candidateAcrs, ORGANISM_SUBJECT, NEIGHBORHOOD_FAA_PATH, CDD_RESULTS_PATH, CDD_DB_PATH, PROTEIN_UP_DOWN, MIN_NUM_PROTEINS_MATCH_CDD, isProdigalUsed, BLAST_TYPE, THREADS_NUM):
with open(NEIGHBORHOOD_FAA_PATH, 'w') as handle:
neighborsFaaStr, NEIGHBORHOOD_NUM_maps_NEIGHBORHOOD_WP = get_acr_neighbors(candidateAcrs, ORGANISM_SUBJECT, PROTEIN_UP_DOWN)
handle.write(neighborsFaaStr)
from subprocess import call as execute
if BLAST_TYPE == 'blastp':
execute(['blastp', '-query', NEIGHBORHOOD_FAA_PATH, '-db', CDD_DB_PATH, '-evalue', '.01', '-outfmt', '7', '-num_threads', THREADS_NUM, '-out', CDD_RESULTS_PATH])
elif BLAST_TYPE == 'rpsblast':
execute(['rpsblast+', '-query', NEIGHBORHOOD_FAA_PATH, '-db', CDD_DB_PATH, '-evalue', '.01', '-outfmt', '7', '-out', CDD_RESULTS_PATH])
return parse_cdd_results(CDD_RESULTS_PATH, NEIGHBORHOOD_NUM_maps_NEIGHBORHOOD_WP, MIN_NUM_PROTEINS_MATCH_CDD, isProdigalUsed)
'''
Purpose:
Goes through the CDD results and determines which Acr/Aca regions to keep
Arguments:
CDD_RESULTS_PATH - Where to store psiblast results
NEIGHBORHOOD_NUM_maps_NEIGHBORHOOD_WP - dict containing newly acquired neighbors (Protein objects) maped by the locus number
MIN_NUM_PROTEINS_MATCH_CDD - Minimum number of proteins that had CDD hits needed to keep a locus
Returns:
goodNeighborhoods - list containing the locus number of candidate Acr/Aca regions that passed CDD parsing
'''
def parse_cdd_results(CDD_RESULTS_PATH, NEIGHBORHOOD_NUM_maps_NEIGHBORHOOD_WP, MIN_NUM_PROTEINS_MATCH_CDD, isProdigalUsed):
uniqueWPHits = dict()
'''
Traverses through psiblast results
'''
with open(CDD_RESULTS_PATH, 'r', 512) as handle:
for line in handle:
if not line.startswith('#'):
cols = line.rstrip().split('\t')
proteinInfo = cols[0].split('|')
if isProdigalUsed:
wp = '-'.join(proteinInfo[0:2])
else:
wp = proteinInfo[1]
'''
Select the most significant rpsblast hit and put them into a dict.
'''
if wp not in uniqueWPHits.keys():
uniqueWPHits[wp] = {'qid': cols[0], 'sid': cols[1], 'evalue': cols[10]}
NEIGHBORHOOD_NUM_maps_CDD_HITS = defaultdict(lambda: 0)
for wp in uniqueWPHits.keys():
for neighborhoodNum, neighborhoodWP in NEIGHBORHOOD_NUM_maps_NEIGHBORHOOD_WP.items():
if wp in neighborhoodWP:
NEIGHBORHOOD_NUM_maps_CDD_HITS[neighborhoodNum] += 1
'''
If MIN_NUM_PROTEINS_MATCH_CDD == 0 it means we shouldn't even run rpsblast
'''
if MIN_NUM_PROTEINS_MATCH_CDD == 0:
goodNeighborhoods = list(NEIGHBORHOOD_NUM_maps_NEIGHBORHOOD_WP.keys())
return goodNeighborhoods, uniqueWPHits
goodNeighborhoods = list()
for neighborhoodNum in NEIGHBORHOOD_NUM_maps_CDD_HITS:
if NEIGHBORHOOD_NUM_maps_CDD_HITS[neighborhoodNum] >= MIN_NUM_PROTEINS_MATCH_CDD:
goodNeighborhoods.append(neighborhoodNum)
return goodNeighborhoods, uniqueWPHits