-
Notifications
You must be signed in to change notification settings - Fork 3
/
union.py
139 lines (114 loc) · 5.23 KB
/
union.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/python
import os.path
import sys
import optparse
import vcfClass
from vcfClass import *
import tools
from tools import *
import intersect
from intersect import setVcfPriority
from intersect import writeVcfRecord
if __name__ == "__main__":
main()
# Calculate the union of the two vcf files.
def unionVcf(v1, v2, priority, outputFile):
success1 = v1.getRecord()
success2 = v2.getRecord()
currentReferenceSequence = v1.referenceSequence
# Finish when the end of either file has been reached.
while success1 or success2:
# If the end of the first vcf file is reached, write out the
# remaining records from the second vcf file.
if not success1:
outputFile.write(v2.record)
success2 = v2.getRecord()
# If the end of the second vcf file is reached, write out the
# remaining records from the first vcf file.
if not success2:
outputFile.write(v1.record)
success1 = v1.getRecord()
if v1.referenceSequence == v2.referenceSequence and v1.referenceSequence == currentReferenceSequence:
if v1.position == v2.position:
writeVcfRecord(priority, v1, v2, outputFile)
success1 = v1.getRecord()
success2 = v2.getRecord()
elif v2.position > v1.position:
success1 = v1.parseVcf(v2.referenceSequence, v2.position, True, outputFile)
elif v1.position > v2.position: success2 = v2.parseVcf(v1.referenceSequence, v1.position, True, outputFile)
else:
if v1.referenceSequence == currentReferenceSequence: success1 = v1.parseVcf(v2.referenceSequence, v2.position, True, outputFile)
elif v2.referenceSequence == currentReferenceSequence: success2 = v2.parseVcf(v1.referenceSequence, v1.position, True, outputFile)
# If the last record for a reference sequence is the same for both vcf
# files, they will both have referenceSequences different from the
# current reference sequence. Change the reference sequence to reflect
# this and proceed.
else:
if v1.referenceSequence != v2.referenceSequence:
print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different."
print >> sys.stderr, "Check that both files contain records for the following reference sequences:"
print >> sys.stderr, "\t", v1.referenceSequence, " and ", v2.referenceSequence
exit(1)
currentReferenceSequence = v1.referenceSequence
def main():
# Parse the command line options
usage = "Usage: vcfPytools.py union [options]"
parser = optparse.OptionParser(usage = usage)
parser.add_option("-i", "--in",
action="append", type="string",
dest="vcfFiles", help="input vcf files")
parser.add_option("-o", "--out",
action="store", type="string",
dest="output", help="output vcf file")
parser.add_option("-f", "--priority-file",
action="store", type="string",
dest="priorityFile", help="output record from this file")
(options, args) = parser.parse_args()
# Check that multiple vcf files are given.
if options.vcfFiles == None:
parser.print_help()
print >> sys.stderr, "\nTwo input vcf files (-i) are required for performing union."
exit(1)
elif len(options.vcfFiles) != 2:
print >> sys.stderr, "Two input vcf files are required for performing union."
# Set the output file to stdout if no output file was specified.
outputFile, writeOut = setOutput(options.output)
# If no priority is given to either file (from the -p command line
# option), set priorityQuality to True. In this case, the record
# written to the output file will be that with the higest quality.
# If a priority is given, check that the file is one of the input
# vcf files.
priority = setVcfPriority(options.priorityFile, options.vcfFiles) # intersect.py
v1 = vcf() # Define vcf object.
v2 = vcf() # Define vcf object.
# Open the vcf files.
v1.openVcf(options.vcfFiles[0])
v2.openVcf(options.vcfFiles[1])
# Read in the header information.
v1.parseHeader(options.vcfFiles[0], writeOut)
v2.parseHeader(options.vcfFiles[1], writeOut)
if priority == 3:
v3 = vcf() # Generate a new vcf object that will contain the header information of the new file.
mergeHeaders(v1, v2, v3) # tools.py
v1.processInfo = True
v2.processInfo = True
else: checkDataSets(v1, v2)
# Check that the header for the two files contain the same samples.
if v1.samplesList != v2.samplesList:
print >> sys.stderr, "vcf files contain different samples (or sample order)."
exit(1)
else:
taskDescriptor = "##vcfPytools=union " + v1.filename + ", " + v2.filename
if priority == 3: writeHeader(outputFile, v3, False, taskDescriptor)
elif (priority == 2 and v2.hasHeader) or not v1.hasHeader: writeHeader(outputFile, v2, False, taskDescriptor) # tools.py
else: writeHeader(outputFile, v1, False, taskDescriptor) # tools.py
# Calculate the union.
unionVcf(v1, v2, priority, outputFile)
# Check that the input files had the same list of reference sequences.
# If not, it is possible that there were some problems.
checkReferenceSequenceLists(v1.referenceSequenceList, v2.referenceSequenceList) # tools.py
# Close the vcf files.
v1.closeVcf(options.vcfFiles[0])
v2.closeVcf(options.vcfFiles[1])
# Terminate the program cleanly.
return 0