-
Notifications
You must be signed in to change notification settings - Fork 1
/
Step1_Bayesian_usearch.py
179 lines (141 loc) · 6.81 KB
/
Step1_Bayesian_usearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# -*- coding: utf-8 -*-
"""
Python: Bayesian Optimization for Usearch parameters values selection
Usearch v9
@author: [email protected]
"""
# BAYESIAN OPTIMIZATION FOR USEARCH PARAMETERS
import numpy as np
from skopt import gp_minimize
from skopt.space import Real, Categorical
from skopt.utils import use_named_args
import os
import time
import argparse
# ARGUMENTS
parser = argparse.ArgumentParser()
parser.add_argument("--database", help="File fasta with database sequences. Example: /path/to/database/database.fasta.")
parser.add_argument("--file_intervals", help="File with intervals for parameters. Example in Example_intervals.txt.")
parser.add_argument("--trim_primers", help="Trim primers in reads if you use database without primers. Optios: YES | NO.")
parser.add_argument("--path_usearch_control", help="Path of usearch and control raw data separated by \",\" without white spaces. Example: /paht/to/usearch,/path/to/reads_control.")
args = parser.parse_args()
#OPEN FILE WITH INTERVALS OF VALUES FOR EACH PARAMETER
try:
file_intervals = open(args.file_intervals, "r")
except:
print("Error: The file " + args.file_intervals + " could not be openned.")
# READ THE FILE
try:
file_intervals_read = file_intervals.readlines()
except:
print("Error: The file " + args.file_intervals + " could not be read.")
# CONSTRUCTION OF INTERVALS DICTIONARY
try:
intervals = {}
for line in file_intervals_read:
name = line.split("\t")[0].strip()
value1 = float(line.split("\t")[1].split(",")[0].strip())
value2 = float(line.split("\t")[1].split(",")[1].strip())
intervals[name] = (value1, value2)
except:
print("Error: The intervals dictionary could not be constructed.")
# CONSTRUCTION OF SPACE FOR BAYESIAN OPTIMIZATION
try:
space = [Real(intervals['diff'][0], intervals['diff'][1], name='diff'),
Real(intervals['pct'][0], intervals['pct'][1], name='pct'),
Real(intervals['maxee'][0], intervals['maxee'][1], name='maxee'),
Real(intervals['amp'][0], intervals['amp'][1], name='amp'),
Categorical([intervals['id'][0], intervals['id'][1]], name = 'id')
]
except:
print("Error: The space for Bayesian Optimization could not be constructed.")
# PATH FOR USEARCH AND CONTROL RAW DATA
PATHUSEARCH = args.path_usearch_control_database.split(",")[0].strip()
PATHCONTROL = args.path_usearch_control_database.split(",")[1].strip()
# THE FUNCTION TO OPTIMIZE
class Usearch:
def __init__(self, database):
self.database = database
def set_params(self, **params): #**params
self.diff = params["diff"]
self.pct = params["pct"]
self.maxee = params["maxee"]
self.amp = params["amp"]
self.id = params["id"]
def predict(self):
start = time.time()
#merge command.
os.system(PATHUSEARCH + "/usearch9 -fastq_mergepairs " + PATHCONTROL + "/*_R1*.fastq -relabel @ -fastq_maxdiffs " + str(self.diff) + " -fastq_maxdiffpct " + str(self.pct) + " -fastqout merge.fq")
if args.trim_primers == "YES":
#Remove the first and last nuecleotides assuming that they are primer sequences.
os.system(PATHUSEARCH + "/usearch9 -fastx_truncate merge.fq -stripleft 21 -stripright 20 -fastqout merge_strip.fq")
#filter command.
if args.trim_primers == "YES":
file_merge = "merge_strip.fq"
else:
file_merge = "merge.fq"
os.system(PATHUSEARCH + "/usearch9 -fastq_filter " + file_merge + " -fastq_maxee " + str(self.maxee) + " -fastaout filter.fa")
#uniques command.
os.system(PATHUSEARCH + "/usearch9 -fastx_uniques filter.fa -fastaout unicos.fa -sizeout")
#unoise command.
os.system(PATHUSEARCH + "/usearch9 -unoise2 unicos.fa -fastaout unoise.fa -minampsize " + str(self.amp) + " -unoise_alpha 3 -log stat_log_unoise.txt")
#search command.
os.system(PATHUSEARCH + "/usearch9 " + str(self.id) + " -db " + self.database + " -strand both -otutabout search_database.txt")
#number of otus matched in database.
try:
fileSearch = open("search_database.txt", "r")
fileSearch_read = file.readlines()
except:
print("Error: The search database file could not be openned or read.")
reads = 0
for line in fileSearch_read[1:]:
for col in range(1, len(line.split("\t"))):
reads = reads + int(line.split("\t")[col].strip())
#The Bayesian optimization considers the lowest result as the most optimal. In our case, the best result is the highest, so we return -result.
print("Running time: ", time.time() - start)
return -reads
@use_named_args(space)
def objetive(**params):
model = Usearch(args.database)
model.set_params(**params)
return model.predict()
#%% OPTIMIZATION
res = gp_minimize(objetive, # the function for the optimization
space, # limits for each dimension
acq_func="EI", # adquisition function
#acq_func="PI",
#acq_func="LCB",
n_calls=100, # number of evaluations
n_random_starts=5, # number of initial random points
noise=0.1**2, # noise level
random_state=123) # seed
#%% RESULTS
print("Optimal values: ", res.x)
print("Optimal function value: ", res.fun)
print("Samples or observations: ", res.x_iters) # check the length of x_iters
print("Obtained values: ", res.func_vals) # the minimum of these values must match res.fun
print("Search space: ", res.space)
try:
file_result = open("Bayesian_usearch.txt", "w+")
except:
print("Error: The file for Bayesian results could not be created.")
file_result.write("Optimal values: " + str(res.x) + "\n")
file_result.write("Optimal function value: " + str(res.fun) + "\n")
file_result.write("Samples or observations: " + str(res.x_iters) + "\n")
file_result.write("Obtained values: " + str(res.func_vals) + "\n")
file_result.write("Search space: " + str(res.space) + "\n")
#%% CONVERGENCY
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
plt.figure(figsize=(5,5))
from skopt.plots import plot_convergence
plot_convergence(res)
plt.savefig("Bayesian.png")
#For the representation of convergence plot in R.
n_calls = len(res.x_iters)
mins = [np.min(res.func_vals[:i]) for i in range(1, n_calls + 1)]
file_convergency = open("Bayesian_data_res.txt", "w+")
file_convergency.write("Number of calls n" + "\t" + "minf(x) after n calls" + "\n")
for n in range(1, n_calls + 1):
file_convergency.write(str(n) + "\t" + str(mins[n - 1]) + "\n")