-
Notifications
You must be signed in to change notification settings - Fork 0
/
ncbi_genome_download_rename.py
64 lines (49 loc) · 1.86 KB
/
ncbi_genome_download_rename.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import argparse
import os
import subprocess
import sys

import pandas as pd
# OPTIONS #####################################################################
# Command-line interface: a metadata table in, a directory of decompressed and
# renamed files out.
parser = argparse.ArgumentParser(
    description="Decompress the gzipped files listed in a metadata table "
                "into one directory, naming each output file "
                "<organism_name>_<asm_name>.<filetype>.")
parser.add_argument('-m',
                    '--metadata',
                    help="Tab-separated metadata file with the columns "
                         "organism_name, asm_name and local_filename",
                    required=True)
parser.add_argument('--out_dir',
                    help="Directory to write the decompressed, renamed files "
                         "to (created if it does not exist)",
                    required=True)
parser.add_argument('--filetype',
                    help="Extension given to the output files "
                         "(default: %(default)s)",
                    default="faa",
                    required=False)
args = parser.parse_args()

# Normalise to an absolute path with a trailing slash so the value can be used
# directly as a prefix for output file names.
args.out_dir = os.path.abspath(args.out_dir) + '/'
if not os.path.exists(args.out_dir):
    print("\nCreating directory " + args.out_dir)
    # exist_ok avoids a crash if the directory appears between the check above
    # and this call (e.g. two runs started at the same time).
    os.makedirs(args.out_dir, exist_ok=True)
# Read Metadata and Build Output Names ########################################
genomes = pd.read_csv(args.metadata, sep="\t")

# Single-character clean-up done in one pass: spaces, '*' and '/' become '_',
# parentheses are dropped.
_NAME_CHARS = str.maketrans({" ": "_", "*": "_", "/": "_", "(": None, ")": None})


def _make_name(organism_name, asm_name):
    """Return the output base name for one metadata row.

    The name is "<organism_name> <asm_name>" with every "strain=" removed,
    spaces, '*' and '/' replaced by '_', and parentheses stripped. Both
    arguments are passed through str(), so a missing value becomes "nan".
    """
    raw = str(organism_name) + " " + str(asm_name)
    return raw.replace("strain=", "").translate(_NAME_CHARS)


# NOTE: infraspecific_name and isolate used to be appended to the name as
# well; that is disabled, so names are built from organism_name + asm_name only.
# Building the whole column at once also works for an empty table, where a
# row-by-row assignment would never create the NAME column.
genomes['NAME'] = [
    _make_name(organism, assembly)
    for organism, assembly in zip(genomes['organism_name'],
                                  genomes['asm_name'])
]

# check for duplicates: two rows with the same NAME would write to the same
# output file, so stop and show every clashing name (keep=False marks all
# members of a duplicate group, not just the later ones).
duplicated_names = genomes['NAME'].duplicated(keep=False)
if duplicated_names.any():
    print(genomes.loc[duplicated_names, 'NAME'].to_string())
    sys.exit("Ooops, looks like there are some duplicate genomes")

genomes = genomes.set_index('NAME')
# Decompress and Rename #######################################################
# For every row (indexed by its output name) decompress 'local_filename' into
# <out_dir>/<name>.<filetype>. The source archive is left in place (-k).
for name, local_filename in genomes['local_filename'].items():
    print(name, local_filename, sep="\t")
    target = os.path.join(args.out_dir, str(name) + '.' + args.filetype)
    # Run gunzip with an argument list instead of a shell string: file names
    # containing spaces, quotes or shell metacharacters are passed through
    # verbatim and can neither break nor inject into the command. '--' stops
    # option parsing in case a path starts with '-'.
    # Raises FileNotFoundError if gunzip is not installed.
    with open(target, 'wb') as out_handle:
        result = subprocess.run(['gunzip', '-qkc', '--', local_filename],
                                stdout=out_handle)
    # Keep going on failure (as before), but do not hide it: a failed gunzip
    # leaves an empty or truncated output file behind.
    if result.returncode != 0:
        print("WARNING: gunzip exited with status " + str(result.returncode)
              + " for " + str(local_filename) + "; " + target
              + " may be empty or incomplete",
              file=sys.stderr)