-
Notifications
You must be signed in to change notification settings - Fork 1
/
simulate.py
84 lines (65 loc) · 2.17 KB
/
simulate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import sys
try:
fasta = open(sys.argv[1],"r")
coverage = int(sys.argv[2])
readLength = int(sys.argv[3])
errorRate = float(sys.argv[4])
assert(errorRate < 1 and errorRate >= 0)
except(IndexError):
print("""Too few arguments found.
This file takes in a fasta file, a coverage value,
a read length, and an error rate.""")
exit(1)
except(ValueError):
print("Coverage and read length must be integers. Error rate must be a float.")
exit(1)
except(FileNotFoundError):
print("FASTA file not found. Try again.")
exit(1)
except(AssertionError):
print("Error rate must be between 0 and 1. You gave a value of ", errorRate)
exit(1)
from random import randint
from random import random
#'parse' the FASTA file by just skipping the 1st line
fasta.readline()
fastaSeq = fasta.read()
fasta.close()
fastaSeq = fastaSeq.replace("\n","").upper()
#print(fastaSeq)
if len(fastaSeq) < readLength:
print("you want a read length greater than the sequence given.")
exit(1)
# calculate N using G (length of FASTA), C (coverage), L (length of the reads)
N = len(fastaSeq) * coverage // readLength
# if it's not an int, round up
# N times...
reads = []
for i in range(0,N):
# choose a random index from the FASTA
start = randint(-readLength,len(fastaSeq)-1)
read = ""
# splice something of length L
end = min(start + readLength,len(fastaSeq))
if start < 0:
start = 0
for ch in fastaSeq[start:end]:
if random() < errorRate:
# add an error at a corresponding rate to the error rate
nucleotides = ["A","T","C","G"]
nucleotides.remove(ch)
# chooses a random other nucleotide
read += nucleotides[randint(0,2)]
else:
read += ch
# add the read to our set of reads
reads.append(read)
#later...
# handle the edge case indices better
# introduce the necessary frequency of errors before storing the read
# output all of our reads into reads.txt
with open("reads.txt","w") as output:
for read in reads:
output.write(read)
output.write("\n")
output.close()