-
Notifications
You must be signed in to change notification settings - Fork 10
/
label_sub_pos.py
66 lines (53 loc) · 2.02 KB
/
label_sub_pos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import sys
import re
import os
import getopt
################ Command-line arguments ################
peptide_column = "Peptide"  # default name to look for peptide column in input file
protein_id_column = "Protein"  # default name to look for protein accessions in input file
splitchar = "_"  # default accession separator, overridable with --splitchar

# Matches any "(...)" group; compiled once because it is applied to every row.
_PARENTHESISED = re.compile(r'\([^)]*\)')


def _print_usage():
    """Print the short usage hint shown for an incomplete command line."""
    print("Warning! wrong command, please read the mannual in Readme.txt.")
    print("Example: python lab_sub_pos.py --input_psm PSM_filename --output output_filename")


def _extract_sub_pos(protein_field, split_char):
    """Return the position suffix of the first accession, or None.

    Only the first ';'-separated accession of *protein_field* is inspected,
    after any parenthesised text has been removed from it.

    * accessions starting with "CanPro": last '_'-separated field
    * accessions starting with "COSMIC": last ':'-separated field
    * anything else: last *split_char*-separated field, but only when the
      accession actually contains *split_char* and that field parses as an
      integer; otherwise None is returned.
    """
    acc = protein_field.split(";")[0]
    acc = _PARENTHESISED.sub("", acc)  # remove text within parentheses
    if acc.startswith("CanPro"):
        return acc.split("_")[-1]
    if acc.startswith("COSMIC"):
        return acc.split(":")[-1]

    fields = acc.split(split_char)
    if len(fields) == 1:
        return None
    try:
        int(fields[-1])
    except ValueError:
        return None
    return fields[-1]


def main(argv=None):
    """Copy a tab-separated PSM table, appending a "sub_pos" column.

    Args:
        argv: command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. Recognised options are ``--input_psm FILE``,
            ``--output FILE`` (both required) and ``--splitchar CHAR``.

    Rows for which no position can be derived are not written to the output.
    Exits with status 1 after printing a hint when the command line is
    incomplete or cannot be parsed.
    """
    if argv is None:
        argv = sys.argv[1:]

    if len(argv) <= 1:  # insufficient number of command-line arguments
        _print_usage()
        sys.exit(1)

    try:
        options, _ = getopt.getopt(argv, '', ['input_psm=', 'output=', 'splitchar='])
    except getopt.GetoptError as err:
        # getopt reports unknown options by raising; it never returns them.
        print("Warning! %s. Exiting..." % err.msg)
        sys.exit(1)

    input_file = None
    output_file = None
    split_char = splitchar
    for opt, arg in options:
        if opt == '--input_psm':
            input_file = arg
        elif opt == '--output':
            output_file = arg
        elif opt == '--splitchar':
            split_char = arg

    if input_file is None or output_file is None:
        _print_usage()
        sys.exit(1)

    with open(input_file, "r") as psm_in:
        # Strip only the line terminator: a plain strip() would also eat
        # leading/trailing tabs and shift the columns when the first or last
        # cell is empty.
        header = psm_in.readline().rstrip("\r\n").split("\t")
        try:
            header.index(peptide_column)  # presence check only
            pro_index = header.index(protein_id_column)
        except ValueError:
            # NOTE(review): exits with status 0 as before -- confirm whether
            # callers would prefer a failing exit code here.
            print("Peptide, Protein columns are not in input table")
            sys.exit()

        # The output is opened only after the header is validated, so a bad
        # input table does not leave an output file behind.
        with open(output_file, "w") as psm_out:
            psm_out.write("\t".join(header + ["sub_pos"]) + "\n")
            for line in psm_in:
                if not line.strip():
                    continue  # blank line, nothing to label
                row = line.rstrip("\r\n").split("\t")
                sub_pos = _extract_sub_pos(row[pro_index], split_char)
                if sub_pos is None:
                    # NOTE(review): such rows are dropped, not written with
                    # "NA" -- this mirrors the previous behaviour; confirm
                    # that filtering is intended.
                    continue
                row.append(sub_pos)
                psm_out.write("\t".join(row) + "\n")


if __name__ == "__main__":
    main()