-
Notifications
You must be signed in to change notification settings - Fork 0
/
db_to_gff.py
167 lines (148 loc) · 6.72 KB
/
db_to_gff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/python
# Script: db_to_gff.py
# Author: Daniel Desiro'
"""
Description:
Get information from a gffutils database and create a new gff file.
Usage:
db_to_gff.py -g <gff_file> -d <database_file>
Source:
https://github.com/desiro/gffDB/blob/master/db_to_gff.py
Reference:
http://pythonhosted.org/gffutils/contents.html
"""
import gffutils
import argparse
import sqlite3
import sys
## main function
def main(gffFile, dataBase, batchSearch, all, features, source, seqid, start, end, strand, within):
# opening the database
try:
db = gffutils.FeatureDB(dbfn=dataBase)
except sqlite3.DatabaseError:
print("Error: Can't open database")
sys.exit()
# getting all data
if all:
data = db.all_features()
printData(data, gffFile, source, 'w')
# getting data from batch file
elif batchSearch:
batchQuery(db, gffFile, batchSearch, source)
# getting single request
else:
singleQuery(db, gffFile, features, source, seqid, start, end, strand, within)
## check items
def batchQuery(db, gffFile, batchSearch, source):
with open(batchSearch,'r') as batch:
first = True
lineNum = 1
for line in batch:
lineList = line.split('\t')
# check if the batch file is consistent
if len(lineList) < 6:
print("Error: Batch file line " + str(lineNum) + " has less than 6 entries")
sys.exit()
seqid, features, start, end, strand, within = lineList
# remove new line character
within = within.strip()
# check if the items are consistent
b_seqid, b_feature, b_start, b_end, b_strand, b_within = checkItems(seqid, features, start, end, strand, within)
# use best search strategy for the requested task
if b_seqid or b_start or b_end:
data = db.region(seqid=b_seqid, start=b_start, end=b_end, strand=b_strand, featuretype=b_feature, completely_within=b_within)
elif b_feature:
data = db.features_of_type(b_feature, strand=b_strand)
else:
data = db.all_features(strand=b_strand)
# print data
if first:
printData(data, gffFile, source, 'w')
first = False
else:
printData(data, gffFile, source, 'a')
lineNum += 1
## check items
def singleQuery(db, gffFile, features, source, seqid, start, end, strand, within):
# check if the items are consistent
c_seqid, c_feature, c_start, c_end, c_strand, c_within = checkItems(seqid, features, start, end, strand, within)
featureList = None
if c_feature:
featureList = c_feature.split(',')
if len(featureList) > 1:
c_feature = featureList
# use best search strategy for the requested task
if c_seqid or c_start or c_end:
data = db.region(seqid=c_seqid, start=c_start, end=c_end, strand=c_strand, featuretype=c_feature, completely_within=c_within)
elif c_feature:
data = db.features_of_type(c_feature, strand=c_strand)
else:
data = db.all_features(strand=c_strand)
printData(data, gffFile, source, 'w')
## check items
def checkItems(seqid, feature, start, end, strand, within):
# check seqid
if not seqid or str(seqid) == '.':
seqid = None
# check features
if not feature or str(feature) == '.':
feature = None
# check start
if not start or str(start) == '.':
start = None
else:
try:
n = int(start)
except ValueError:
print("Error: Start \'" + start + "\' is not convertible to integer")
sys.exit()
# check end
if not end or str(end) == '.':
end = None
else:
try:
n = int(end)
except ValueError:
print("Error: End \'" + end + "\' is not convertible to integer")
sys.exit()
# check strand
if not strand or str(strand) == '.':
strand = None
elif not (str(strand) == '+' or str(strand) == '-' or str(strand) == 'x'):
print("Error: Strand \'" + strand + "\' is not a valid strand item")
sys.exit()
# check within
if within == True or str(within) == 'w':
within = True
else:
within = False
return seqid, feature, start, end, strand, within
## print data
def printData(data, gffFile, source, write):
# open the gff file
with open(gffFile,write) as file:
for entry in data:
if source and entry.source != source:
try:
next(data)
except StopIteration:
pass
else:
file.write(str(entry) + "\n")
if __name__ == '__main__':
parser = argparse.ArgumentParser(prog = 'db_to_gff.py', description = 'Get information from a gffutils database and create a new gff file.', prefix_chars='-+', epilog="")
parser.add_argument('--version', action='version', version='%(prog)s 0.1')
parser.add_argument('--gff', '-g', dest='gffFile', required=True, help='GFF file name')
parser.add_argument('--database', '-d', dest='dataBase', required=True, help='gffutils database')
parser.add_argument('--batch', '-b', dest='batchSearch', help='tabular file with search data; format: seqid - feature - start - end - strand - within; values can be empty or use \'.\' as a place holder; always provide 5 tabs; use \'x\' for unstranded features; use \'w\' for within')
parser.add_argument('--all', '-a', dest='all', action='store_true', help='returns all db entries')
parser.add_argument('--features', '-f', dest='features', help='returns all entries with the requested features; can be a comma separated list')
parser.add_argument('--source', '-so', dest='source', help='returns features with the requested source')
parser.add_argument('--seqid', '-sq', dest='seqid', help='returns features with the requested seqid')
parser.add_argument('--start', '-s', dest='start', help='only returns features that start with this region or after this region')
parser.add_argument('--end', '-e', dest='end', help='only returns features that end with this region or before this region')
parser.add_argument('--strand', '-r', dest='strand', choices=['+', '-', 'x'], help='returns only features in strand direction; \'x\' returns unstranded features')
parser.add_argument('--within', '-w', dest='within', action='store_true', help='forces the feature to be completely within the provided --start and/or --end region')
options = parser.parse_args()
main(options.gffFile, options.dataBase, options.batchSearch, options.all, options.features, options.source, options.seqid, options.start, options.end, options.strand, options.within)