-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_to_ftp.py
executable file
·125 lines (115 loc) · 5.15 KB
/
data_to_ftp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!usr/bin/env -python
import os
import sys
import argparse
import ConfigParser
import subprocess
from ftplib import FTP
DESCRIPTION = """\
A stand alone script to deliver the data to a ftp server provided by USER outside
Sweden or from some other reason who dont have acces to UPPMAX. It is neccesary
before running this script the fake/real delivery to am UPPMAX projects INBOX
should be done.
[ftp]
domain: xxxx
port: xxxx
username: xxxx
password: xxxx
project: xxxx
uppmaxid: xxxx"""
config_file_format = """Input config file should be formated as mentioned in description, especially the
key/header names should be same as in the example, else the list 'config_keys'
in source code should be changed to match the key names as in config file"""
## command line arugments for the script ##
parser = argparse.ArgumentParser(description=DESCRIPTION,formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('config_file',type=str,help=config_file_format)
parser.add_argument('--exclude_sample',type=str,default=None,help="Samples to be exculded from tranfser, can be single sample \
name or multiple sample name seperated by comma(,).")
parser.add_argument('--only_sample',type=str,default=None,help="Sample(s) that have to copied ignoring other in the INBOX. \
Multiple sample names are given by comma separated.")
parser.add_argument('--no_reports',default=False,action='store_true',help="Dont tranfer/copy report in INBOX")
parser.add_argument('--no_md5check',default=False,action='store_true',help="Dont check for md5sum files for fastq files, by default it does.")
args = parser.parse_args()
## method to check and parse config file ##
def get_config_info(config_file,check_keys):
config = ConfigParser.ConfigParser()
config_path = config.read(config_file)
try:
return {k:config.get('ftp',k) for k in check_keys}
except:
sys.exit("Check the keys in config file, see help for more info regarding the format")
## method to create a md5sum file for given name and given file ##
def do_md5sum(fq,md5):
try:
cmd = subprocess.Popen(["md5sum", fq],stdout=subprocess.PIPE)
cmd.wait()
md5_op = cmd.communicate()[0].strip()
with open(md5,'w') as op_file:
op_file.write(md5_op)
return True
except:
return False
config_keys = ["domain", "port", "username", "password", "project", "uppmaxid"]
config_file = args.config_file
exculde_samples = args.exclude_sample.split(',') if args.exclude_sample else []
config_info = get_config_info(config_file,config_keys)
proj = config_info.get('project')
sm_cnt = 0
pj_dir = "/proj/{}/INBOX/{}".format(config_info.get('uppmaxid'),proj)
os.chdir(pj_dir)
if args.only_sample:
samples = args.only_sample.split(',')
else:
samples = [item for item in os.listdir(os.getcwd()) if os.path.isdir(item) and item not in exculde_samples]
reports = [item for item in os.listdir(os.getcwd()) if item.endswith('.pdf')] if not args.no_reports else []
print "Total {} samples and {} reports are going to be copied now for project {}".format(len(samples),len(reports),proj)
## opening a ftp session ##
ftp = FTP()
ftp.connect(config_info.get('domain'),config_info.get('port'))
ftp.login(config_info.get('username'),config_info.get('password'))
ftp.cwd('/')
try:
ftp.mkd(proj)
print "Project folder has been created for {} in remote..".format(proj)
except:
print "Project folder already exists for {} in remote..".format(proj)
pass
ftp.cwd(proj)
## transfer reports if not otherwise mentioned
for report in reports:
with open(report,'rb') as rep:
ftp.storbinary("STOR {}".format(report),rep)
## transfer fastq files for samples
for sam in samples:
os.chdir(pj_dir)
ftp.cwd("/{}".format(proj))
try:
ftp.mkd(sam)
ftp.cwd(sam)
print "Process for samples {} have started..".format(sam)
except:
print "Sample {} already exists in remote.. skipping..".format(sam)
continue
flowcell = os.listdir(sam)
for fc in flowcell:
os.chdir(os.path.join(pj_dir,sam,fc))
ftp.cwd("/{}/{}".format(proj,sam))
ftp.mkd(fc)
ftp.cwd(fc)
fq_files = [fl for fl in os.listdir(os.getcwd()) if fl.endswith('.fastq.gz')]
for fq in fq_files:
if not args.no_md5check:
md5 = "{}.md5".format(fq)
if not os.path.exists(md5):
fq_md5 = do_md5sum(fq,md5)
if not fq_md5:
sys.exit("Could not create md5sum for sample {} in flowcell {} and file {}".format(sam,fc,fq))
with open(md5, 'rb') as md5_file:
ftp.storbinary("STOR {}".format(md5),md5_file)
with open(fq,'rb') as fq_file:
ftp.storbinary("STOR {}".format(fq),fq_file)
print "Data uploaded for sample {} in flowcell {}".format(sam,fc)
print "All data for sample {} is now completed..".format(sam)
sm_cnt += 1
ftp.quit()
print "Transfer done, total {}/{} samples proccessed".format(sm_cnt,len(samples))