-
Notifications
You must be signed in to change notification settings - Fork 1
/
summarize_per_read.py
executable file
·38 lines (33 loc) · 1.17 KB
/
summarize_per_read.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import pandas as pd
import sys
from glob import glob
def getsp(x):
    """Flatten stringified taxa collections into a single list of names.

    Each element of *x* is treated as a bracketed string whose entries are
    wrapped in single quotes, e.g. ``"['Homo sapiens' 'Mus musculus']"`` or
    ``"['Homo sapiens', 'Mus musculus']"``. The first and last characters
    (the brackets) are dropped and the remainder is split on single quotes.

    Parameters
    ----------
    x : iterable
        Cells to parse; non-string cells (e.g. NaN from an empty field) are
        skipped.

    Returns
    -------
    list of str
        Every quoted name found, in input order. Duplicates are kept.
    """
    res = []
    for y in x:
        if not isinstance(y, str):
            # Missing values cannot be sliced or split; ignore them.
            continue
        pieces = y[1:-1].split("'")
        # Splitting on quotes leaves the separators between names (spaces,
        # newlines, and commas when the cell is a list repr) as their own
        # fragments; keep only fragments that hold real text.
        res.extend(z for z in pieces if z.replace(',', '').strip())
    return res
# Show usage and stop when help is requested or when either of the two
# required positional arguments (glob pattern, output prefix) is missing.
if len(sys.argv) < 3 or '-h' in sys.argv or '--help' in sys.argv:
    print('Usage:')
    print("python summarize_per_read.py '*_output_12S_Numver_unique_species_"
          "per_read.tsv' 12S")
    print('**NOTE: pay imporant attention to the quotes over the pattern')
    sys.exit()
# Positional arguments: the glob pattern selecting the per-read tables and
# the prefix used to name the output file.
pattern = sys.argv[1]
prefix = sys.argv[2]
# The pattern is expanded by glob(), so it should be written the way it would
# be given to the unix `ls` command (quoted on the command line so the shell
# does not expand it first).
gl = glob(pattern)
summ = []
# Column order of the output table.
order = ['Sample', 'Count unique reads', 'Sum of unique taxa',
         'Species in sample']
for fn in gl:
    # The sample name is everything before the first underscore of the path.
    sample = fn.split('_')[0]
    df = pd.read_table(fn, sep='\t')
    ntax = df['No. unique taxa'].sum()
    nreads = df.qseqid.nunique()
    # Store the flattened species list as its repr so it fits in one cell.
    arr = repr(getsp(df['Unique taxa'].unique()))
    summ.append({'Sample': sample, 'Count unique reads': nreads,
                 'Sum of unique taxa': ntax, 'Species in sample': arr})
# Passing the columns to the constructor (rather than to to_csv) keeps the
# same column order and also yields a header-only file, instead of a
# KeyError, when the pattern matched no files.
pd.DataFrame(summ, columns=order).to_csv('%s_summary.tsv' % prefix, sep='\t',
                                         index=False)