set_conf<-function(
request,
value=NULL
){
#' set configuration parameters
#'
#' Function that sets the environment variables, either to hard-coded defaults,
#' to an individually requested value, or to values read from a configuration file
#'
#' @param request the name of the requested variable to set. Special values are 'defaults' and 'set_from_file'.
#' @param value the value to assign to the requested variable. For set_from_file it is the path of the file to read in.
#'
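#' @examples
#' #Illustrative usage based on the special values documented above; the
#' #configuration-file path shown is a hypothetical example.
#' \dontrun{
#' set_conf("defaults")   #set all variables to the hard-coded defaults
#' set_conf("verbose", 5) #set a single named variable
#' set_conf("set_from_file", "/home/ubuntu/misc_files/configuration.R")
#' }
#'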
default_configuration<-list(
"max_imputations_per_node"=1, #the max number of parallel imputations to run
"max_imputations_in_queue" =200, #the max number of imputations allowed waiting in a queue
"server_role"='Hub', #the role of this computer, can be either Hub or Node
"hub_address"='', #if server_role is Node, then the IP-address of the Hub is required
"from_email_password"='', #optional password for sending out emails
"from_email_address"='', #optional email-address/username for sending out emails
"routinely_delete_this"='', #delete these parts in routine 14-day data deletion. May put in 'link' and/or 'data', which is the default online. But docker-running defaults to not deleting anything.
"paypal"='https://www.paypal.me/lfolkersenimputeme/5', #suggest to donate to this address in emails
"bulk_node_count"=1, #count of bulk-nodes, used only for calculating timings in receipt mail
"error_report_mail"='', #optional email-address to send (major) errors to
"seconds_wait_before_start"=0, #a delay that is useful only with CPU-credit systems
"running_as_docker"=TRUE, #adapt to docker running
"autostart_supercronic"=TRUE, #Attempt to auto-start cron-jobs with the prepare_individual_genome function (makes it easier for casual docker users, but may confuse things in pipeline running)
"max_imputation_chunk_size"=1000, #how much stuff to put into the memory in each chunk. Lower values results in slower running with less memory-requirements.
"minimum_required_variant_in_vcf_count"=200000, #The least amount of matching variants acceptable for passing as a whole-genome sequencing vcf
"block_double_uploads_by_md5sum"=FALSE, #If the upload interface should give an error when the exact same file is being uploaded twice (by md5sum).
"plink_algorithms_to_run"='SCORESUM_PLINK_2_0_DOSAGE_MATRIX', #What types of plink algorithms to run (see prs module export_script.R for details)
"modules_to_compute"='AllDiseases,autoimmuneDiseases,BRCA,drugResponse,ethnicity,rareDiseases,ukbiobank,prs', #Select the modules to pre-run (defaults to all folders in the repo if set to NULL).
"cron_logs_path"='/home/ubuntu/logs/cron_logs/', #Path to write cron-logs to. Can be empty.
"submission_logs_path"='/home/ubuntu/logs/submission/', #Path to write submission logs to (information about user errors, etc). Can be empty.
"shiny_logs_path"='/home/ubuntu/logs/shiny/', #Path to write shiny-logs to (the shiny-server system's logs). Can be empty.
"misc_files_path"='/home/ubuntu/misc_files/', #Path to misc-files to (e.g. md5sums, height-scores, imputemany-registry etc). Can be empty, but then the default configuration.R will be written.
"data_path"='/home/ubuntu/data/', #Path to write processed final data (imputed bulk data, prs-jsons, etc)
"programs_path"='/imputeme/programs/', #Path where to look for programs and imputation-reference. Is not changed during processing and rarely in updates, and must be pre-loaded.
"prs_dir_path"='/imputeme/prs_dir/', #Path where to look for prs-weights and frequency data. Is not changed during processing but often in updates, and must be pre-loaded.
"imputations_path"='/home/ubuntu/imputations/', #Landing folder for microarray data to be imputed. Can be empty, must be writeable.
"vcfs_path"='/home/ubuntu/vcfs/', #Landing folder for sequencing data to be processed without imputation. Can be empty, must be writeable.
"code_path"='/imputeme/code/impute-me/', #Path of the entire impute-me github code repository. Cannot be empty. Must be writeable for the ./www/ folder exposition-system through the shiny-server to work, otherwise immutable.
"uploads_for_imputemany_path"='/home/ubuntu/uploads_for_imputemany/', #Path to where uploads to imputemany are saved before being split
"bulk_imputations_path"='/home/ubuntu/bulk_imputations/', #Working folder for bulk imputation
"verbose"=1 #how much info to put into logs (min 0, max 10)
)
#set all according to list above
if(request == "defaults"){
if(!is.null(value))stop("When setting defaults, value must be given as NULL")
do.call(Sys.setenv, default_configuration)
if(default_configuration[["verbose"]]>1){
print(paste0(Sys.time(),": Setting ",length(default_configuration)," environment variables. Any previous values were overwritten."))
}
#set all according to file given as value
}else if(request == "set_from_file"){
if(!file.exists(value))stop("Setting defaults from file, but can't find the file at the path given by value")
source(value)
missing<-vector()
#loop over read in parameters, register which is missing and change the rest in default list from above
for(i in 1:length(default_configuration)){
if(exists(names(default_configuration)[i])){
default_configuration[[i]] <- get(names(default_configuration)[i])
}else{
missing<-c(missing,names(default_configuration)[i])
}
}
do.call(Sys.setenv, default_configuration)
#report changes
if(default_configuration[["verbose"]]>3){
if(length(missing)>0){
print(paste0(Sys.time(),": Setting ",length(default_configuration)-length(missing)," environment variables from file ",value,". Setting ",length(missing)," values to hard-coded defaults because they were missing: ",paste(missing,collapse=", ")))
}else{
print(paste0(Sys.time(),": Setting ",length(default_configuration)-length(missing)," environment variables from file ",value))
}
}
}else{
if(is.null(value))stop("Can't set_conf a value of NULL")
if(!request %in% names(default_configuration))stop(paste0("Didn't recognize ",request," as an imputeme environment variable. Please select only one of: ", paste(sort(names(default_configuration)),collapse=", ")))
previous_value<-Sys.getenv(request)
names(value)<-request
do.call(Sys.setenv, as.list(value))
print(paste0(Sys.time(),": Setting environment variable ",request," to ",value,". Previous value was ",previous_value))
}
}
get_conf<-function(
request
){
#' get configuration parameters
#'
#' Function that reads the environment variables, reformats them to the relevant
#' R classes, and decides whether to stop on missing values
#'
#'
#' @param request the name of the requested variable
#'
#' @return The value of the requested variable, either from the configuration_file or as a default value if not available in configuration file.
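#'
#' @examples
#' #Illustrative usage, assuming set_conf("defaults") has been run first; note
#' #the class conversion per variable type.
#' \dontrun{
#' set_conf("defaults")
#' get_conf("verbose")            #returned as numeric (default 1)
#' get_conf("running_as_docker")  #returned as logical (default TRUE)
#' get_conf("data_path")          #returned as character
#' }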
#always import verbose
verbose<-as.numeric(Sys.getenv("verbose"))
#reform class as necessary
#numeric
if(request %in% c("max_imputations_per_node","max_imputations_in_queue","bulk_node_count","seconds_wait_before_start","max_imputation_chunk_size","verbose","minimum_required_variant_in_vcf_count")){
assign(request,as.numeric(Sys.getenv(request)))
#logical
}else if(request %in% c("running_as_docker","block_double_uploads_by_md5sum","autostart_supercronic")){
assign(request,as.logical(Sys.getenv(request)))
}else if(request %in% c("modules_to_compute","plink_algorithms_to_run","routinely_delete_this")){
#comma-separated lists to vectors
assign(request,strsplit(Sys.getenv(request),",")[[1]])
#character
}else{
assign(request,Sys.getenv(request))
}
#then do checkups for each of the variables (could probably be skipped later, but checks were already
#written)
if(request == "verbose"){
#already done
}else if(request == "max_imputations_per_node"){
if(!is.numeric(max_imputations_per_node))stop("max_imputations_per_node not numeric")
if(length(max_imputations_per_node)!=1)stop("max_imputations_per_node not length 1")
if(max_imputations_per_node <= 0 | max_imputations_per_node > 1000)stop("max_imputations_per_node must be between 1 and 1000")
}else if(request == "max_imputations_in_queue"){
if(!is.numeric(max_imputations_in_queue))stop("max_imputations_in_queue not numeric")
if(length(max_imputations_in_queue)!=1)stop("max_imputations_in_queue not length 1")
if(max_imputations_in_queue <= 0 | max_imputations_in_queue > 100000)stop("max_imputations_in_queue must be between 1 and 100000")
}else if(request == "server_role"){
if(!is.character(server_role))stop("server_role not character")
if(length(server_role)!=1)stop("server_role not length 1")
if(!server_role%in%c("Hub","Node"))stop("server_role not Hub or Node")
}else if(request == "hub_address"){
if(!is.character(hub_address))stop("hub_address not character")
if(length(hub_address)!=1)stop("hub_address not length 1")
}else if(request == "from_email_password"){
if(!is.character(from_email_password ))stop("from_email_password not character")
if(length(from_email_password )!=1)stop("from_email_password not length 1")
}else if(request == "from_email_address"){
if(!is.character(from_email_address))stop("from_email_address not character")
if(length(from_email_address)!=1)stop("from_email_address not length 1")
}else if(request == "routinely_delete_this"){
if(!is.character(routinely_delete_this))stop("routinely_delete_this not character")
if(!all(routinely_delete_this%in%c("link","data")))stop("routinely_delete_this not equal to link, data or both")
}else if(request == "paypal"){
if(!is.character(paypal))stop("paypal not character")
if(length(paypal)!=1)stop("paypal not length 1")
}else if(request == "bulk_node_count"){
if(!is.numeric(bulk_node_count ))stop("bulk_node_count not numeric")
if(length(bulk_node_count)!=1)stop("bulk_node_count not length 1")
if(bulk_node_count <= 0 | bulk_node_count > 100)stop("bulk_node_count must be between 1 and 100")
}else if(request == "error_report_mail"){
if(!is.character(error_report_mail))stop("error_report_mail not character")
if(length(error_report_mail)!=1)stop("error_report_mail not length 1")
}else if(request == "seconds_wait_before_start"){
if(!is.numeric(seconds_wait_before_start))stop("seconds_wait_before_start not numeric")
if(length(seconds_wait_before_start)!=1)stop("seconds_wait_before_start not length 1")
}else if(request == "running_as_docker"){
if(!is.logical(running_as_docker))stop("running_as_docker not logical")
if(length(running_as_docker)!=1)stop("running_as_docker not length 1")
}else if(request == "autostart_supercronic"){
if(!is.logical(autostart_supercronic))stop("autostart_supercronic not logical")
if(length(autostart_supercronic)!=1)stop("autostart_supercronic not length 1")
}else if(request == "max_imputation_chunk_size"){
if(!is.numeric(max_imputation_chunk_size))stop("max_imputation_chunk_size not numeric")
if(length(max_imputation_chunk_size)!=1)stop("max_imputation_chunk_size not length 1")
}else if(request == "minimum_required_variant_in_vcf_count"){
if(!is.numeric(minimum_required_variant_in_vcf_count))stop("minimum_required_variant_in_vcf_count not numeric")
if(length(minimum_required_variant_in_vcf_count)!=1)stop("minimum_required_variant_in_vcf_count not length 1")
if(minimum_required_variant_in_vcf_count < 1000)stop("minimum_required_variant_in_vcf_count must be at least 1000")
}else if(request == "block_double_uploads_by_md5sum"){
if(!is.logical(block_double_uploads_by_md5sum))stop("block_double_uploads_by_md5sum not logical")
if(length(block_double_uploads_by_md5sum)!=1)stop("block_double_uploads_by_md5sum not length 1")
}else if(request == "modules_to_compute"){
if(!is.character(modules_to_compute))stop("modules_to_compute not character")
if(!all(modules_to_compute%in%list.files(get_conf("code_path"))))stop(paste("A request was made for modules_to_compute that were not found in ",get_conf("code_path"),":",paste(modules_to_compute,collapse=", ")))
}else if(request == "plink_algorithms_to_run"){
if(!is.character(plink_algorithms_to_run))stop("plink_algorithms_to_run not character")
}else if(request == "cron_logs_path"){
if(!is.character(cron_logs_path))stop("cron_logs_path not character")
if(length(cron_logs_path)!=1)stop("cron_logs_path not length 1")
}else if(request == "submission_logs_path"){
if(!is.character(submission_logs_path))stop("submission_logs_path not character")
if(length(submission_logs_path)!=1)stop("submission_logs_path not length 1")
}else if(request == "shiny_logs_path"){
if(!is.character(shiny_logs_path))stop("shiny_logs_path not character")
if(length(shiny_logs_path)!=1)stop("shiny_logs_path not length 1")
}else if(request == "misc_files_path"){
if(!is.character(misc_files_path))stop("misc_files_path not character")
if(length(misc_files_path)!=1)stop("misc_files_path not length 1")
}else if(request == "data_path"){
if(!is.character(data_path))stop("data_path not character")
if(length(data_path)!=1)stop("data_path not length 1")
}else if(request == "programs_path"){
if(!is.character(programs_path))stop("programs_path not character")
if(length(programs_path)!=1)stop("programs_path not length 1")
}else if(request == "prs_dir_path"){
if(!is.character(prs_dir_path))stop("prs_dir_path not character")
if(length(prs_dir_path)!=1)stop("prs_dir_path not length 1")
}else if(request == "imputations_path"){
if(!is.character(imputations_path))stop("imputations_path not character")
if(length(imputations_path)!=1)stop("imputations_path not length 1")
}else if(request == "vcfs_path"){
if(!is.character(vcfs_path))stop("vcfs_path not character")
if(length(vcfs_path)!=1)stop("vcfs_path not length 1")
}else if(request == "code_path"){
if(!is.character(code_path))stop("code_path not character")
if(length(code_path)!=1)stop("code_path not length 1")
}else if(request == "uploads_for_imputemany_path"){
if(!is.character(uploads_for_imputemany_path))stop("uploads_for_imputemany_path not character")
if(length(uploads_for_imputemany_path)!=1)stop("uploads_for_imputemany_path not length 1")
}else if(request == "bulk_imputations_path"){
if(!is.character(bulk_imputations_path))stop("bulk_imputations_path not character")
if(length(bulk_imputations_path)!=1)stop("bulk_imputations_path not length 1")
}else if(request == "version"){
#note this is special - since it is *not* taken from configuration file, but hard-coded
version <- "v1.0.7"
}else{
stop(paste0("Unknown request:",request,". Try to write one of the following instead: autostart_supercronic, block_double_uploads_by_md5sum, bulk_imputations_path, bulk_node_count, code_path, cron_logs_path, data_path, error_report_mail, from_email_address, from_email_password, hub_address, imputations_path, max_imputation_chunk_size, max_imputations_in_queue, max_imputations_per_node, minimum_required_variant_in_vcf_count, misc_files_path, modules_to_compute, paypal, plink_algorithms_to_run, programs_path, prs_dir_path, routinely_delete_this, running_as_docker, seconds_wait_before_start, server_role, shiny_logs_path, submission_logs_path, uploads_for_imputemany_path, vcfs_path, verbose, version"))
}
#export the request
return(get(request))
}
prepare_individual_genome<-function(
path,
email=NULL,
updateProgress=NULL,
protect_from_deletion=FALSE,
filename=NULL,
predefined_uniqueID=NULL,
overrule_vcf_checks=FALSE
){
#' prepare individual genome
#'
#' This is the main data-receiving function. It performs all checks that can be made
#' in web-speed, meaning maximum a few seconds. If these are passed, the file, plus
#' meta-information is saved in the queueing area, for processing and further checks.
#' In web-running this function is called in the www.impute.me/imputeme interface,
#' and will immediately decide if a file-submission is accepted or not. It is also the
#' main function for accessing submissions programatically e.g. in docker running-
#'
#' @param path The path to an input file. Can be either microarray format or vcf.
#' @param email The user-email to report back to. Optional.
#' @param updateProgress The progress update tracker for the shiny interface. Optional.
#' @param protect_from_deletion A switch carried down the analysis, that can be set as TRUE for debugging purposes. When TRUE, final results will not be deleted after 14 days, and less cleaning of part-calculations will be done.
#' @param filename An optional filename of the sample. This is useful to have separate for e.g. shiny file submission, where it otherwise may be called e.g. /tmp/RtmpAKCi8i/9527843b29213cdef70532ff/0.gz. If it is not set, it will default to basename(path).
#' @param predefined_uniqueID An optional pre-defined uniqueID. This is only used when accessing the function from outside of the usual web-upload interface, and bypasses the random-ID step. Still has to obey the usual 12-digit alphanumeric rules.
#' @param overrule_vcf_checks A logical indicating if the (strict) vcf-file checks should be bypassed. Don't do this when accepting web-input, since a lot of low-pass and exon-seq vcfs are received there. But it can be done in a controlled environment.
#'
#' @return A short text string with a ready message. Additionally, the file provided as path will be available for processing in the standard-file structure, either in ~/imputations or in ~/vcfs as relevant.
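#'
#' @examples
#' #Illustrative submissions; file paths and email address are hypothetical.
#' \dontrun{
#' #submit a microarray-format file (e.g. a 23andme-style export)
#' prepare_individual_genome("/tmp/my_genome.txt", email="user@example.com")
#' #submit a whole-genome vcf and keep intermediate files for debugging
#' prepare_individual_genome("/tmp/my_genome.vcf.gz", email="user@example.com",
#'   protect_from_deletion=TRUE)
#' }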
#loading libraries
library("tools")
suppressWarnings(library("shiny"))
#set logging level
verbose <- get_conf("verbose")
#check that input path is ok and accesible
if(class(path)!="character")stop(paste("path must be character, not",class(path)))
if(length(path)!=1)stop(paste("path must be length 1, not",length(path)))
if(!file.exists(path))stop(paste("Did not find file at path:",path))
if(substr(path,1,1)!="/")path<-normalizePath(path)
#Check filename is ok and set to basename(path) if not given
#the actual file will anyway be renamed as <uniqueID>_raw_data.txt so it is not an
#essential variable (except the file-ending), but it does need to be cleaned for special characters
#for storing and mailing back to users
if(is.null(filename))filename<-basename(path)
if(class(filename)!="character")stop(paste("filename must be character, not",class(filename)))
if(length(filename)!=1)stop(paste("filename must be length 1, not",length(filename)))
filename<-gsub("\\ ","_",filename)
filename<-gsub("[\\$\\&\\+\\,\\:\\;\\=\\?\\@\\#\\\"\\\']","",filename)
#check if this sample should be protected_from_deletion
if(class(protect_from_deletion)!="logical")stop(paste("protect_from_deletion must be logical, not",class(protect_from_deletion)))
if(length(protect_from_deletion)!=1)stop(paste("protect_from_deletion must be length 1, not",length(protect_from_deletion)))
#check the updateProgress object - set to a NULL-returning function if not given.
if(is.null(updateProgress))updateProgress<-function(detail,value,max){return(NULL)}
if(class(updateProgress)!="function")stop(paste("updateProgress must be function, not",class(updateProgress)))
#check the user-inputted email, set to the default error_report_mail if not given
if(is.null(email))email<-get_conf("error_report_mail")
if(class(email)!="character")stop(paste("email must be character, not",class(email)))
if(length(email)!=1)stop(paste("email must be length 1, not",length(email)))
if(!get_conf("running_as_docker")){
if( email == "" | sub("[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}","",toupper(email)) != ""){
stop(safeError(paste("a real email adress is needed:",email)))
}
}
#updating progress
updateProgress(detail = "Check mail, check queue, check md5sum",value=1,max=4)
#check for too many ongoing imputations
if(verbose>0)print(paste0(Sys.time(),": Checking for too many ongoing imputations"))
s<-list.files(get_conf("imputations_path"))
if(length(grep("^imputation_folder",s)) >= get_conf("max_imputations_in_queue")){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"too_many_jobs",email,length(grep("^imputation_folder",s)))
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
stop(safeError(paste("Too many imputations are already in progress. Cannot start a new one. The only solution to this is to wait a few days until the queues are shorter.")))
}
#check for vcf file
if(length(grep("\\.vcf\\.gz$",filename))==1 | length(grep("\\.vcf$",filename))==1 | suppressWarnings(length(grep("fileformat=VCF",readLines(path,n=1)))>0)){
is_vcf_file <- TRUE
}else{
is_vcf_file<-FALSE
}
#Create uniqueID and check that it is unique
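#(IDs consist of 'id_' plus a 4-digit and a 5-digit number - 12 characters in
#total - with 1-3 of the inner digits swapped for letters, e.g. the
#hypothetical "id_4t39z8705")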
if(is.null(predefined_uniqueID)){
uniqueID <- paste("id_",sample(1000:9000,1),sample(10000:90000,1),sep="")
numberOfLetters<-sample(c(1,1,2,3),1)
if(numberOfLetters>0){
positionsToInsertLetter<-sample(5:(nchar(uniqueID)-1),numberOfLetters)
l<-c(LETTERS,letters)
l<-l[!l%in%c("o","O")] #omit o/O to avoid confusion with the digit zero
for(x in positionsToInsertLetter){
substr(uniqueID,x,x)<-sample(l,1)
}
}
}else{ #if pre-supplied, check that is according to ID-rules.
uniqueID<-predefined_uniqueID
if(nchar(uniqueID)!=12)stop(safeError("uniqueID must be exactly 12 characters long"))
if(length(grep("^id_",uniqueID))==0)stop(safeError("uniqueID must start with 'id_'"))
}
if(uniqueID%in%list.files(get_conf("data_path"))){ #Also check for pre-existing uniqueIDs and stop if so (never happened though)
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"double_id",email,uniqueID)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
stop(safeError("Problem with unique ID generation. Please re-load and try again."))
}
if(verbose>0)print(paste0(Sys.time(),": Created uniqueID ",uniqueID))
#This blocks prepares a microarray-based sample directly for imputation
#the next if-block will instead prepare a vcf-based sample in a separate vcf-area
if(!is_vcf_file){
#create imputation folder.
if(verbose>0)print(paste0(Sys.time(),": Create imputation folder for ",uniqueID))
homeFolderShort<-paste("imputation_folder",uniqueID,sep="_")
homeFolder<-paste0(get_conf("imputations_path"),homeFolderShort,"/")
dir.create(homeFolder,recursive=T)
write.table("Job is not ready yet",file=paste0(homeFolder,"job_status.txt"),col.names=F,row.names=F,quote=F)
#unzipping (or not) and moving to new place
newTempPath <- paste(homeFolder,paste(uniqueID,"_raw_data",sep=""),sep="/")
newUnzippedPath <- paste(homeFolder,paste(uniqueID,"_raw_data.txt",sep=""),sep="/")
file.copy(path, newTempPath)
gunzipResults<-unzip(newTempPath,exdir=homeFolder)
if(length(gunzipResults)==1){ #then it's a zip file
if(verbose>0)print(paste0(Sys.time(),": Unzipping and moving to new place"))
file.rename(gunzipResults, newUnzippedPath)
}else{ #then it's probably not
#check if it is a gz file
if(verbose>0)print(paste0(Sys.time(),": Handling non-zip file and moving to new place"))
filetype<-system(paste("file ", newTempPath),intern=T)
if(length(grep("gzip compressed",filetype))==1){
#if it's a gzip file we generally fail it, because gzipped uploads contain the
#oddest custom formats. However, we have to allow FTDNA uploads, because
#since 2020-10-20 FTDNA uses .gz as its default
if(length(grep("autosomal_o37",filename,ignore.case = T))>0){
zcat_result<-system(paste0("zcat ", newTempPath, " | sed 's/,/\\t/g' | grep '^rs' > ",newUnzippedPath),intern=T)
if(length(zcat_result)!=0){
if(!protect_from_deletion)unlink(homeFolder,recursive=T)
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"gzip_ftdna_problem",email,uniqueID,filename)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
stop(safeError("This file looked like a FTDNA file but could not accurately be handled as such. This is a rare error, but we unfortunately cannot proceed."))
}
#if it's a non FTDNA gz file we fail it
}else{
if(!protect_from_deletion)unlink(homeFolder,recursive=T)
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"gzip_file",email,uniqueID,filename)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
stop(safeError("Don't submit gz-files. Only uncompressed text or zip-files. If you already know what a gz file is, this should be easy for you. Please format as tab separated text files."))
}
}else{
#otherwise just rename
if(verbose>0)print(paste0(Sys.time(),": Moving file to new place"))
file.rename(newTempPath, newUnzippedPath)
}
}
path <- newUnzippedPath
#checking if it is a consistent file
if(verbose>0)print(paste0(Sys.time(),": checking if it is a consistent file"))
testRead<-try(read.table(path,nrow=10,stringsAsFactors=F))
if(class(testRead)=="try-error"){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"general_data_file_problem",email,uniqueID,filename)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
if(!protect_from_deletion)unlink(homeFolder,recursive=T)
stop(safeError("Your file didn't seem like genomic data at all. It must contain many rows, one per SNP, with information about your genotype. Please write an email if you think this is a mistake and that this file format should be supported."))
}
#updating progress
updateProgress(detail = "Consistency checks",value=2,max=4)
#checking if there is at least 100k lines (otherwise imputation wouldn't be possible anyway)
cmd1 <- paste0("wc -l ",path)
lines<- as.numeric(sub(" .+$","",system(cmd1,intern=T)))
if(lines < 100000){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"too_few_lines_error",email,uniqueID)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
if(!protect_from_deletion)unlink(homeFolder,recursive=T)
stop(safeError(paste0("Your file only had ",lines," lines. That doesn't look like a genome-wide microarray input file. Genome-wide microarray files have many formats and come from many places (23andme, myheritage, ancestrycom, etc), but they always have hundreds of thousands of measurements")))
}
#running the alternative format converters
if(ncol(testRead)==5){
#This could be an ancestry.com file. Check that first
testRead2<-read.table(path,nrow=10,stringsAsFactors=F,header=T)
if(unique(sub("[0-9]+$","",testRead2[,1])[1])!="rs"){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"ancestry_problem",email,uniqueID)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
if(!protect_from_deletion)unlink(homeFolder,recursive=T)
stop(safeError("Your file seemed like ancestry.com data, but didn't have rs IDs in column 1"))
}
#ok, this is probably an ancestry.com file. Let's reformat.
reformat_outcome<-try(format_ancestry_com_as_23andme(path))
}else if(ncol(testRead)==1){
#this could be myheritage. Let's try with that
reformat_outcome<-try(format_myheritage_as_23andme(path))
}else{
reformat_outcome<-"didn't try"
}
if(class(reformat_outcome)=="try-error"){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"reformat_error",email,uniqueID,filename)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
if(!protect_from_deletion)unlink(homeFolder,recursive=T)
stop(safeError("Your file didn't seem to match any of our import algorithms. If you think this data type should be supported, then you are welcome to write an email and attach a snippet of the data for our inspection."))
}
#after reformat attempts, perform one more test read and consider
testRead2<-read.table(path,nrow=10,stringsAsFactors=F)
if(ncol(testRead2)!=4){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"test_read_4_columns",email,uniqueID)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
if(!protect_from_deletion)unlink(homeFolder,recursive=T)
stop(safeError("Your file didn't have 4 columns (or 5 for ancestry.com data). If you think this data type should be supported, then you are welcome to write an email and attach a snippet of the data for our inspection."))
}
if(unique(sub("[0-9]+$","",testRead2[,1])[2])!="rs"){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"test_read_no_rs_id",email,uniqueID,filename)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
if(!protect_from_deletion)unlink(homeFolder,recursive=T)
stop(safeError("Your file didn't have rs IDs in column 1, line 2. If you think this data type should be supported, then you are welcome to write an email and attach a snippet of the data for our inspection."))
}
#This block prepares a vcf file in a separate vcf-file holding area for later processing
}else if(is_vcf_file){
#create vcf folder
if(verbose>0)print(paste0(Sys.time(),": create vcf folder for ",uniqueID))
homeFolderShort<-paste("vcf_folder",uniqueID,sep="_")
homeFolder<-paste0(get_conf("vcfs_path"),homeFolderShort,"/")
dir.create(homeFolder,recursive=T)
write.table("Job is not ready yet",file=paste0(homeFolder,"job_status.txt"),col.names=F,row.names=F,quote=F)
#unzipping (or not) and moving to new place
if(verbose>0)print(paste0(Sys.time(),": unzipping (or not) and moving to new place"))
newTempPath <- paste(homeFolder,paste(uniqueID,"_raw_data",sep=""),sep="/")
newReadyPath <- paste(homeFolder,paste(uniqueID,"_raw_data.vcf.gz",sep=""),sep="/")
file.copy(path, newTempPath)
#file.rename(path, newTempPath) #better from space/speed perspective - but unstable across file-systems
#check if it's a gz file - if it is we use it as-is
if(substr(filename, nchar(filename)-2,nchar(filename)) == ".gz" | length(grep("gzip",system(paste("file ", newTempPath),intern=T)))>0){
filetype<-system(paste("file ", newTempPath),intern=T)
if(length(grep("gzip",filetype))==1){
file.rename(newTempPath, newReadyPath)
}else{
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"gzip_wrong_file",email,uniqueID,filename)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
if(!protect_from_deletion)unlink(homeFolder,recursive=T)
stop(safeError("The submitted file ended in .gz, but was determined to not be gzipped file. It could therefore not be processed and have been deleted."))
}
}else{
#otherwise we bgzip it
cmd<-paste0("bgzip ",newTempPath)
system(cmd)
file.rename(paste0(newTempPath,".gz"), newReadyPath)
}
path <- newReadyPath
#updating progress
updateProgress(detail = "Consistency checks",value=2,max=4)
#checking if it is a consistent file
if(verbose>0)print(paste0(Sys.time(),": checking if it is a consistent file"))
testRead<-try(read.table(path,nrow=100,stringsAsFactors=F))
if(class(testRead)=="try-error"){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"general_data_file_problem_vcf",email,uniqueID,filename)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
if(!protect_from_deletion)unlink(homeFolder,recursive=T)
stop(safeError("Your file didn't seem like genomic data at all. The filename ended in vcf, so we expected a vcf file, but this did not seem to be the case."))
}
#read header
testReadHeader<-try(readLines(path,n=250))
#check for check-up escape sentence and/or overrule_vcf_checks argument
#can be anything in the header, e.g.
###INFO=<ID=Something=I solemnly swear that this file is of good enough quality>
escape_checks<-FALSE
escape_sentences <- "I solemnly swear that this file is of good enough quality"
for(escape_sentence in escape_sentences){
if(length(agrep(escape_sentence, testReadHeader, max.distance = 0.1,ignore.case=T))>0){
escape_checks<-TRUE
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"vcf_escape_sentence_used",email,uniqueID,filename)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
}
}
if(overrule_vcf_checks)escape_checks<-TRUE
#check header
if(!testReadHeader[1] %in% c("##fileformat=VCFv4.2","##fileformat=VCFv4.1","##fileformat=VCFv4.0") && !escape_checks){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"wrong_starting_vcf_header",email,uniqueID,filename)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
if(!protect_from_deletion)unlink(homeFolder,recursive=T)
stop(safeError(paste0("The first header line of your VCF didn't have exactly '##fileformat=VCFv4.2'. It had ",testReadHeader[1],". This is required, not because these exact columns are important, but because the importer strives to avoid any custom-format imports at all. See the github issue #32 for further discussion (https://github.com/lassefolkersen/impute-me/issues/32). You may also try to follow the cookbook-recipe in this discussion in order to convert your file into microarray-like data, and then retry upload.")))
}
#size check requirement
#we used to have this as a line-count requirement, but the zcat path | wc -l call
#is too slow. Instead it's set to at least 100 MB size, which is about half
#an average Dante lab vcf file. We haven't established a firm lower
#bound on this, but it is logged for future fine-tuning.
#
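#(104857600 bytes = 100 * 1024 * 1024, i.e. a 100 MB cutoff on the gzipped file)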
size_requirement <- 104857600
size <- file.info(path)[["size"]]
if(size < size_requirement && !escape_checks){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"too_small_vcf_error",email,uniqueID,filename, size,size_requirement)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
if(!protect_from_deletion)unlink(homeFolder,recursive=T)
stop(safeError(paste0("Your vcf-file was too small to be accepted. Currently, we only accept vcf files that are larger than ",size_requirement/(1024*1024)," GB (gzipped). Your submitted data was ",signif(size/(1024*1024),3)," GB. This is done to avoid accidentally processing exon-sequencing data, which would be a problem for most subsequent algorithms. They all require genome-wide data, because the vcf-handling is done without imputation. You can read more about why that is, at http://doi.org/10.13140/RG.2.2.34644.42883. You may also find some tips on converting exon-sequencing data to microarray-like data-format in github issue #32 https://github.com/lassefolkersen/impute-me/issues/32. After doing this, you may resubmit your file and it will be run through the imputation-algorithm. It is possible, although un-tested, that some useful information can be extraxted when using this approach.")))
}
#Check and block the word 'indels' in the filename (common error from Dante labs uploads - usually caught by size-requirement but not always )
if(length(grep("indel",filename,ignore.case=T))>0 && !escape_checks){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"vcf_indel_name_error",email,uniqueID,filename)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
if(!protect_from_deletion)unlink(homeFolder,recursive=T)
stop(safeError("The vcf reading algorithm found the word 'indel' in the file name. Typically vcf files should contain both single nucleotide variants and indels, but if they only contain indels the calculations will fail. If this data is from Dante labs, check that you use the file with 'SNPs' or 'combined' in the name instead. If you are sure this file is correct, you may simply rename it to not include the word indel, and retry upload."))
}
#Try to establish genome build from header
grch37_hits<-grep("grch37|hg19",testReadHeader,ignore.case = TRUE,value=TRUE)
grch38_hits<-grep("grch38|hg38",testReadHeader,ignore.case = TRUE,value=TRUE)
if(length(grch37_hits)>0 & length(grch38_hits)>0 ){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"vcf_both_builds_error",email,uniqueID,filename)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
if(!protect_from_deletion)unlink(homeFolder,recursive=T)
stop(safeError("The vcf reading algorithm found references to both genome build grch37 and grch38 in the vcf header and was unsure how to proceed. Your data have been deleted. If you want to re-submit the data and try again, we recommend that you edit the header of your vcf file such that there are only references to either grch37/hg19 or grch38/hg38, as the case may be."))
}
#Can be inserted if you don't want to have grch38 blocks (but it's not a very good filter, only scans header)
# if(length(grch38_hits)>0){
# m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"vcf_grch38_not_allowed",email,uniqueID,filename)
# m<-paste(m,collapse="\t")
# write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
# if(!protect_from_deletion)unlink(homeFolder,recursive=T)
# stop(safeError("The vcf reading algorithm determined this data to be from the GRCh38 genome build. We are sorry, but this genome build is not supported yet. In the meantime, you may look into doing a liftover to GRCh37 yourself using tools such as Picard LiftoverVcf. Just remember to delete any references to GRCh38 in the vcf-header if you do so."))
# }
#check that first line column 9 is consistent
#There is a lot of fine-tuning going into this demand, here's some preliminary observations.
# First: we dislike formats that don't have DP, since it prevents filtering out low-pass sequencing samples
# That's ok for companies like Dante lab (GT:AD:AF:DP:F1R2:F2R1:GQ:PL:GP:PRI:SB:MB) and Nebula (GT:AD:DP:GQ:PGT:PID:PL:PS)
# That's a problem for the following companies that only report the GT field:
# genotek.ru, estonian biobank and sequencing.com ("ultimate DNA test kit"). This is a pity, since
# it seems the data is of ok quality otherwise. Possibly one could write a GT-only
# catching mechanism for them. For now we just fail them. At the benefit of not majorly
# messing up some low-pass or exon-seq submission.
allowed_formats <- c("GT:AD:AF:DP:F1R2:F2R1:GQ:PL:GP:PRI:SB:MB","GT:AD:DP:GQ:PGT:PID:PL:PS","GT:AD:DP:GQ:PL","GT:AD:DP:GQ:PGT:PID:PL")
if(!testRead[1,9] %in% allowed_formats && !escape_checks){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"wrong_starting_vcf_line",email,uniqueID,filename,testRead[1,9])
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
if(!protect_from_deletion)unlink(homeFolder,recursive=T)
stop(safeError(paste0("The first line of the 'format'-column of your VCF had ",testRead[1,9],". We currently only allow vcf-files with very specific formatting-requirements, corresponding to Dante labs and Nebula. This is necessary in order to avoid grave mistakes with custom-formatted e.g. exon sequencings and also to be able to test the read-depth (the DP format entry). See github issue #32 for further discussion. In this discussion you may also find suggestions on alternative approaches to submitting your file for processing. The currently allowed format-column entries include: ",paste(allowed_formats,collapse=", "))))
}
#check read-depth
minimum_mean_depth_of_first_hundred_reads <- 10 #would be nice to have the genome-wide average depth, since the first chunk of reads in testRead could be from hard-to-sequence regions. But it's not possible to read the entire vcf in browsing-time, which is required for these format rejection-checks
format<-try(strsplit(testRead[1,9],":")[[1]],silent=T)
depth_index <- try(which(format%in%"DP"),silent=T)
depths<-try(as.numeric(sapply(strsplit(testRead[,10],":"),function(x,depth_index){x[depth_index]},depth_index)),silent=T)
if((any(is.na(depths)) || class(depths) == "try-error" || mean(depths,na.rm=T) < minimum_mean_depth_of_first_hundred_reads) && !escape_checks){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"too_low_vcf_depth",email,uniqueID,filename)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
if(!protect_from_deletion)unlink(homeFolder,recursive=T)
stop(safeError(paste0("The average depth (DP) entry of the first hundred lines of your vcf file was not at least ",minimum_mean_depth_of_first_hundred_reads," which is required. Low-coverage sequencing will not work in the down-stream algorithms.")))
}
}
##checking if this job has not actually been run before
if(get_conf("block_double_uploads_by_md5sum")){
if(verbose>0)print(paste0(Sys.time(),": checking if this job has not actually been run before"))
this_person_md5sum <- md5sum(path)
all_md5sums_path<-paste0(get_conf("misc_files_path"),"md5sums.txt")
if(!file.exists(all_md5sums_path)){write("md5sums",file=paste0(get_conf("misc_files_path"),"md5sums.txt"))}
all_md5sums<-read.table(all_md5sums_path,sep="\t",stringsAsFactors = F)[,1]
if(this_person_md5sum %in% all_md5sums){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"md5sum_match",email,this_person_md5sum,filename)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
if(!protect_from_deletion)unlink(homeFolder,recursive=T)
stop(safeError("A person with this genome was already analyzed by the system. If this is an error, you can try to re-upload a new version of your DNA-data, e.g. you can go to your data provider (23andme, ancestry.com) and re-download the data you are interested. Then try that file. There will be no block flag for such new file, because any edits to the file will make the system unable to recognize it as a previously analyzed genome."))
}
write(this_person_md5sum,file=paste0(get_conf("misc_files_path"),"md5sums.txt"),append=TRUE)
}
#finalizing:
#1) saving the small job_status.txt that just shows a status for the node/hub setup to read quickly over ssh
#2) saving the variables.rdata which is a file of processing-specific parameters that is needed in first process, but afterwards deleted
if(verbose>0)print(paste0(Sys.time(),": Finalize prepare_individual_genome"))
imputemany_upload <- FALSE
should_be_imputed <- TRUE
upload_time<-format(Sys.time(),"%Y-%m-%d-%H-%M-%S")
save(uniqueID,email,filename,protect_from_deletion,is_vcf_file,imputemany_upload,should_be_imputed,upload_time,file=paste0(homeFolder,"variables.rdata"))
unlink(paste0(homeFolder,"job_status.txt"))
write.table("Job is ready",file=paste0(homeFolder,"job_status.txt"),col.names=F,row.names=F,quote=F)
#updating progress
updateProgress(detail = "Unzipping, sending receipt mail",value=3,max=4)
#If possible, send off a mail as a receipt of data
if(get_conf("from_email_address") != "" & get_conf("from_email_password") != "" & get_conf("error_report_mail")!= ""){
queue_length <- length(list.files(get_conf("imputations_path")))
if(is_vcf_file){
algorithm_name <- "a vcf-extraction algorithm"
}else{
algorithm_name <- "an imputation algorithm"
}
message_start <-paste0("<HTML>We received your data from file <i>", filename,"</i> at www.impute.me. It will now be processed, first through ",algorithm_name," and then through several types of genetic-risk score calculators. This takes a little less than a day per genome.")
if(queue_length > 50){
run_time <- 0.75 #days (was 1.6 with t2.medium, but now on c5a.large it has dropped to 0.75 days)
servers_running <- get_conf("bulk_node_count") #servers
genomes_per_server <- 10 #genomes
genomes_per_day <- servers_running * genomes_per_server / run_time
days_left <- queue_length / genomes_per_day
days_left_lower <- round(days_left*1.2) #because there are always at least some paid ones skipping the line
days_left_upper <- round(days_left*2.5)
queue_message<-paste0(" Currently ",queue_length," other genomes are waiting in queue, so expect approximately ",days_left_lower,"-",days_left_upper," days of waiting.")
}else if(queue_length > 5){
queue_message<-paste0(" Currently ",queue_length," other genomes are waiting in queue, so expect several days of waiting.")
}else{
queue_message<-""
}
message_end1 <- paste0(" The service is non-profit, but because of heavy computing-requirements for the imputation analysis it is not free to run. We therefore strongly encourage you to pay a contribution to keep the servers running (<u><a href='",get_conf("paypal"),"'>paypal</a></u>, suggested 5 USD). Also, if you do this and put your unique ID (<i>",uniqueID,"</i>) as the payment-message, you'll be moved to the priority queue. Either way, once the analysis is finished you'll receive a mail containing download links for the imputed data. You will also be able to browse the analytics-interface using this uniqueID.<br><br> ")
#assign the right language book to people (Swedes and Norwegians can get the Danish language one too)
if(length(grep("\\.dk$",email))==1 | length(grep("\\.no$",email))==1 | length(grep("\\.se$",email))==1){
booklink<-"https://www.saxo.com/dk/forstaa-dit-dna_lasse-westergaard-folkersen_haeftet_9788770170154"
}else{
booklink<-"https://www.worldscientific.com/worldscibooks/10.1142/11070"
}
message_end2 <- paste0("In the meantime, may we recommend the book <u><a href='",booklink,"'>'Understand your DNA'</a></u> that serves as a guide for the underlying concepts of this analysis.<br></HTML>")
#send the mail
message <- paste0(message_start,queue_message,message_end1,message_end2)
#Send receipt mail
suppressWarnings(library("gmailr",warn.conflicts = FALSE))
gm_auth_configure( path =paste0(get_conf("misc_files_path"),"mailchecker.json"))
gm_auth(email=get_conf("from_email_address"),cache=paste0(get_conf("misc_files_path"),"mail_secret"))
prepared_email <- try(gm_mime() %>%
gm_to(email) %>%
gm_from(get_conf("from_email_address")) %>%
gm_subject("Imputation is queued") %>%
gm_html_body(message))
mailingResult<-try(gm_send_message(prepared_email))
if(class(mailingResult)=="try-error"){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"mailing_error",email,uniqueID,filename)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
}
}
#if autostart_supercronic, then kill previous supercronic jobs and restart
if(get_conf("autostart_supercronic")){
processes<-system("ps -e",intern=T)
processes<-do.call(rbind,strsplit(processes," +"))
if("supercronic" %in% processes[,5]){
pids<-processes[processes[,5]%in%"supercronic",2]
for(pid in pids){
if(verbose>1)print(paste0(Sys.time(),": Killing previous supercronic process: ",pid," (only need one)"))
system(paste("kill",pid))
Sys.sleep(0.1)
}
}
if(verbose>=0 & !file.exists(paste0(get_conf("programs_path"),"supercronic.txt")))print(paste0(Sys.time(),": Code was found to be running as docker container, but with no supercronic.txt file ready for launch. The job needs to be executed manually."))
#it's possible that the trailing ampersand is not a good idea. When the
#docker is running as a web-interface it works well, but when the function is
#called from outside of the docker, with docker exec, then supercronic terminates
#after a few seconds. It can be kept alive with e.g. a Sys.sleep() call, but that
#would break the feedback on the website (port 3838-based), and there's no easy
#way to tell which place invoked the command. So right now the web interface
#"wins" because that's for more casual users. Super users can then separately
#call and start the supercronic
supercronic_out<-try(system(paste0("supercronic ",get_conf("programs_path"),"supercronic.txt &")))
if(supercronic_out != 0)stop(safeError("Code was found to be running as docker container, but gave an error when trying to start supercronic. The job must be started manually."))
}
#end with a message. In docker the message includes uniqueID, in web-running it does not (needs email absolutely)
if(get_conf("running_as_docker")){
return(paste("Genome files succesfully submitted. <b>The processing of your genome will take several days to run</b>. Typically between 1 and 5 days, depending on server-queue. The uniqueID of this run is <b>", uniqueID,"</b> - you can close this browser window, but do note the uniqueID since you need it when processing is finished."))
}else{
return(paste("Genome files succesfully submitted. <b>The processing of your genome will take several days to run</b>. Typically between 1 and 5 days, depending on server-queue. When the processing is finished you will receive an email to",email,"with uniqueID and download-instructions. Look in your spam filter if not. You can close this browser window."))
}
}
prepare_imputemany_genome<-function(
path,
email=NULL,
updateProgress=NULL,
should_be_imputed=TRUE,
protect_from_deletion=FALSE,
filename=NULL
){
#' prepare imputemany genome
#'
#' This is the data-receiving function for batch uploads, performing the
#' same function as prepare_individual_genome, only for many genomes
#' at the same time. Like prepare_individual_genome, it performs at
#' web-speed, meaning a few seconds (although slower than individual-genome
#' handling). If the checks are passed, the files are converted to individual
#' genomes and saved, along with meta-information, in the queueing area for
#' further processing and checks.
#'
#' @param path The path to an input file
#' @param email The user-email to report back to
#' @param updateProgress The progress update tracker for the shiny interface
#' @param protect_from_deletion A switch carried down the analysis, that can be set as TRUE for debugging purposes
#' @param filename The optional filename of the sample (useful to have separate for e.g. shiny file submission). If not set it will default to basename(path).
#'
#' @return A short text string with a completion message. In addition all submitted samples will be available for processing in the standard impute-me file-structure, at ~/imputations
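#'
#' @examples
#' #Illustrative batch submissions; the zip-file path and email address are
#' #hypothetical.
#' \dontrun{
#' #submit a zip-file of many genomes and send them on to imputation
#' prepare_imputemany_genome("/tmp/many_genomes.zip", email="user@example.com")
#' #only store the upload for later manual handling, skipping imputation
#' prepare_imputemany_genome("/tmp/many_genomes.zip", email="user@example.com",
#'   should_be_imputed=FALSE)
#' }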
#set programs
library("tools")
suppressWarnings(library("shiny"))
plink=paste0(get_conf("programs_path"),"plink")
#set logging level
verbose <- get_conf("verbose")
if(class(verbose)!="numeric")stop(paste("verbose must be numeric, not",class(verbose)))
if(length(verbose)!=1)stop(paste("verbose must be length 1, not",length(verbose)))
if(verbose > 2){
ignore.stdout <- FALSE
}else{
ignore.stdout<- TRUE
}
if(verbose > 1){
ignore.stderr <- FALSE
}else{
ignore.stderr<- TRUE
}
#check path
if(class(path)!="character")stop(paste("path must be character, not",class(path)))
if(length(path)!=1)stop(paste("path must be length 1, not",length(path)))
if(!file.exists(path))stop(paste("Did not find file at path:",path))
#Check filename is ok - set to basename(path) if not given
if(is.null(filename))filename<-basename(path)
if(class(filename)!="character")stop(paste("filename must be character, not",class(filename)))
if(length(filename)!=1)stop(paste("filename must be length 1, not",length(filename)))
if(length(grep(" ",filename))>0)stop(safeError("Please don't use spaces in filenames"))
#check if this sample should be protected_from_deletion
if(class(protect_from_deletion)!="logical")stop(paste("protect_from_deletion must be logical, not",class(protect_from_deletion)))
if(length(protect_from_deletion)!=1)stop(paste("protect_from_deletion must be length 1, not",length(protect_from_deletion)))
#check the updateProgress object - set to a NULL-returning function if not given.
if(is.null(updateProgress))updateProgress<-function(detail,value,max){return(NULL)}
if(class(updateProgress)!="function")stop(paste("updateProgress must be function, not",class(updateProgress)))
#check the user-inputted email, set to the default error_report_mail if not given
if(is.null(email))email<-get_conf("error_report_mail")
if(class(email)!="character")stop(paste("email must be character, not",class(email)))
if(length(email)!=1)stop(paste("email must be length 1, not",length(email)))
if(!get_conf("running_as_docker")){
if( email == "" | sub("[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}","",toupper(email)) != ""){
stop(safeError(paste("a real email adress is needed:",email)))
}
}
#check if should_be_imputed is ok
if(class(should_be_imputed)!="logical")stop(paste("should_be_imputed must be logical, not",class(should_be_imputed)))
if(length(should_be_imputed)!=1)stop(paste("should_be_imputed must be length 1, not",length(should_be_imputed)))
#check if mail address is in positive list for bulk upload (when running as docker it is ok to auto-create this)
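#the accepted_emails.txt file is expected to be a whitespace-separated table
#with an 'email' and an 'imputeok' column, e.g. (hypothetical address shown):
# email imputeok
# someone@example.com TRUE
# any TRUE
#where the special entry 'any' accepts any sender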
acceptedMails_path<-paste0(get_conf("misc_files_path"),"accepted_emails.txt")
if(!file.exists(acceptedMails_path)){
if(get_conf("running_as_docker")){
default_accepted_emails_all<-c(
"email imputeok",
"any TRUE"
)
if(!file.exists(dirname(acceptedMails_path)))dir.create(dirname(acceptedMails_path),recursive = T)
f<-file(acceptedMails_path,"w")
writeLines(default_accepted_emails_all,f)
close(f)
if(verbose>0)print(paste0(Sys.time(),": Didn't find an accepted_emails.txt, so wrote a default one, accepting all emails."))
}else{
stop(safeError("Configuration error: Email accepted-emails list not found"))
}
}
#read accepted emails
acceptedMails<-read.table(acceptedMails_path,stringsAsFactors=F,header=T)
if(!email%in%acceptedMails[,"email"] & !"any" %in% acceptedMails[,"email"]){ #bulk-upload must adhere to upload criteria
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"not_accepted_email",email,path)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
stop(safeError(paste0("Email ",email," was not in the accepted-emails list, and/or the entry 'any' was not found in the accepted emails list. Your data will not be processed and have already been deleted.")))
}
if(should_be_imputed ){
if(!"any" %in% acceptedMails[,"email"]){
imputeok <- acceptedMails[acceptedMails[,"email"]%in%email,"imputeok"]
if(!imputeok)stop(safeError("Email was in the accepted-emails list, but was FALSE for imputeok"))
}
}
#set upload time
upload_time<-format(Sys.time(),"%Y-%m-%d-%H-%M-%S")
#check for too many ongoing imputations
if(verbose>0)print(paste0(Sys.time(),": Check for too many ongoing imputations"))
s<-list.files(get_conf("imputations_path"))
if(length(grep("^imputation_folder",s)) >= get_conf("max_imputations_in_queue")){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"too_many_jobs",email,length(grep("^imputation_folder",s)))
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
stop(safeError(paste("Too many imputations are already in progress. Cannot start a new one.")))
}
##checking if this job has not actually been run before
if(get_conf("block_double_uploads_by_md5sum")){
if(verbose>0)print(paste0(Sys.time(),": checking if this job has not actually been run before"))
this_person_md5sum <- md5sum(path)
all_md5sums_path<-paste0(get_conf("misc_files_path"),"md5sums.txt")
if(!file.exists(all_md5sums_path)){write("md5sums",file=paste0(get_conf("misc_files_path"),"md5sums.txt"))}
all_md5sums<-read.table(all_md5sums_path,sep="\t",stringsAsFactors = F)[,1]
if(this_person_md5sum %in% all_md5sums){
m<-c(format(Sys.time(),"%Y-%m-%d-%H-%M-%S"),"md5sum_match",email,this_person_md5sum)
m<-paste(m,collapse="\t")
write(m,file=paste0(get_conf("submission_logs_path"),"submission_log.txt"),append=TRUE)
stop(safeError(paste0("This file was already analyzed by the system. Write an email if you wish to clear this flag (or make a small change in your file so it doesn't have the same md5sum, i.e. ",this_person_md5sum,").")))
}
write(this_person_md5sum,file=paste0(get_conf("misc_files_path"),"md5sums.txt"),append=TRUE)
}
#Start handling logic. There's many different paths in and out of this interface
#because it both has to handle various input formats, but also be able to either
#send the data to the main imputation algorithm, or elsewhere for later
#manual handling - depending on user choice
#if the 'should be imputed" switch is not on, we just leave them in a folder for
#later (manual) inspection and downstream processing. This is the most 'robust'
#choice in the sense that virtually nothing is done automatically.
if(!should_be_imputed){
outfolder <- get_conf("uploads_for_imputemany_path")
if(!file.exists(outfolder))dir.create(outfolder)
file.copy(path,paste0(outfolder,upload_time,".zip") )
}
#if the 'should be imputed" switch is on, a lot more overhead code is required to
#handle different file-types - because this will be done at (slow) web-speed, hence the progress-tracker
if(should_be_imputed){
#unpacking and file-submission logic
uploads_for_imputemany_path <- get_conf("uploads_for_imputemany_path")
if(!file.exists(uploads_for_imputemany_path))dir.create(uploads_for_imputemany_path)
newUnzippedPath <- paste0(uploads_for_imputemany_path,upload_time,"_input.txt")
gunzipResults<-unzip(path,exdir=uploads_for_imputemany_path)
if(length(grep(" ",gunzipResults))>0)stop(safeError("Please don't use spaces in filenames, also not inside zip-file contents."))
gunzipResults<-grep("_MACOSX",gunzipResults,invert=T,value=T)
if(length(gunzipResults)==1){ #then it's a zip file
file.rename(gunzipResults, newUnzippedPath)
submission_type <- "illumina"
}else if(length(gunzipResults)==2){
if(!all(c("ped","map")%in%gsub("^.+\\.","",gunzipResults))){
stop(safeError("If submitting zip files with two files in them, their endings must be .ped and .map"))
}else{
submission_type <- "plink-ped"
}
}else if(length(gunzipResults)==3){
if(!all(c("bed","fam","bim")%in%gsub("^.+\\.","",gunzipResults))){
stop(safeError("If submitting zip files with their files in them, their endings must be bed and fam and bim"))
}else{
submission_type <- "plink-bed"
}
}else if(length(gunzipResults)>3){
stop(safeError("Don't submit zip files with more than two files in them"))
}else{ #then it's probably not
#check if it is a gz file
filetype<-system(paste("file ", path),intern=T)