---
title: "<img src='www/deriv.jpg' width='240'>"
subtitle: "[Deriv.com](https://github.com/englianhu/binary.com-interview-question) - Interday High Frequency Trading Models Comparison <span style='color:red'>**Blooper**</span>"
author: "[®γσ, Lian Hu](https://englianhu.github.io/) <img src='www/Me byteDance.jpg' width='24'> <img src='www/RYU.jpg' width='24'> <img src='www/ENG.jpg' width='24'>® <img src='www/xueba1.jpg' width='24'>"
date: "`r lubridate::today('Asia/Tokyo')`"
output:
  html_document:
    mathjax: https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
    number_sections: yes
    toc: yes
    toc_depth: 4
    toc_float:
      collapsed: yes
      smooth_scroll: yes
    code_folding: hide
    css: CSSBackgrounds.css
---
# Abstract
Two years have passed since [Binary.com-is-Rebranding-to-Deriv.com](https://derivdotcom.medium.com/binary-com-is-rebranding-to-deriv-com-and-here-is-everything-you-need-to-know-6f4a8513c84b). Here I summarise my previous research papers in [Binary.com → Deriv.com](https://englianhu.medium.com/binary-com-deriv-com-6058cdbfc3a1) and continue from there into high-frequency trading.
```{r, warning = FALSE, message = FALSE}
if(!suppressPackageStartupMessages(require('BBmisc'))) {
  install.packages('BBmisc', dependencies = TRUE, INSTALL_opts = '--no-lock')
}
suppressPackageStartupMessages(require('BBmisc'))
# suppressPackageStartupMessages(require('rmsfuns'))

## required packages
pkgs <- c('devtools', 'knitr', 'kableExtra', 'tint', 'dygraphs',
          'readr', 'lubridate', 'data.table', 'reprex',
          'feather', 'purrr', 'quantmod', 'tidyquant', 'plotly',
          'tibbletime', 'furrr', 'flyingfox', 'tidyr', 'jsonlite',
          'timetk', 'plyr', 'dplyr', 'stringr', 'magrittr', 'tdplyr',
          'tidyverse', 'memoise', 'htmltools', 'formattable', 'rbokeh',
          'dash', 'dashCoreComponents', 'dashHtmlComponents', 'dtplyr',
          ## https://dashr.plotly.com
          'zoo', 'forecast', 'seasonal', 'seasonalview', 'rjson',
          'rugarch', 'rmgarch', 'mfGARCH', 'sparklyr', 'jcolors',
          'microbenchmark', 'dendextend', 'lhmetools', 'ggthemr',
          'pacman', 'profmem', 'DescTools', 'ggthemes',
          'echarts4r', 'viridis', 'hrbrthemes',
          'fable', 'fabletools', 'Rfast', 'Metrics', 'MLmetrics')
# https://github.com/mpiktas/midasr
# https://github.com/onnokleen/mfGARCH
# devtools::install_github("business-science/tibbletime")
# devtools::install_github("DavisVaughan/furrr")
suppressAll(lib(pkgs))
# load_pkg(pkgs)

## helper functions used throughout this paper
funs <- c('uv_fx.R', 'opt_arma.R', 'multi_seasons.R',
          'filterFX.R', 'filter_spec.R', 'mv_fx.R',
          'task_progress.R', 'read_umodels.R', 'convertOHLC.R')
l_ply(funs, function(x) source(paste0('./function/', x)))

# spark_install()
# if(FALSE) {
#   ## Not run due to side-effects
#   # spark_home_set()
# }
# sc <- spark_connect(master = 'local')

.cl = FALSE
Sys.setenv(TZ = 'Asia/Tokyo')

## options(knitr.table.format = 'html') sets all kableExtra tables to 'html';
## otherwise the format needs to be set on every single table.
options(warn = -1, knitr.table.format = 'html')#, digits.secs = 6)

## https://stackoverflow.com/questions/39417003/long-vectors-not-supported-yet-error-in-rmd-but-not-in-r-script
knitr::opts_chunk$set(cache = TRUE, warning = FALSE,
                      message = FALSE, cache.lazy = FALSE)

## https://www.researchgate.net/post/How_to_solve_Error_cannot_allocate_vector_of_size_12_Gb_in_R
## Note: memory.size()/memory.limit() are Windows-only.
memory.size()              ## check the current memory size
memory.limit()             ## check the set limit
# memory.size(size = 500000)
memory.limit(size = 56000) ## expand the limit beyond physical memory; 56000 is proposed for 64-bit.

rm(pkgs, funs)
```
# Introduction
Referring to <span style='color:goldenrod'>*GARCH模型中的ARIMA(p,d,q)参数最优化*</span> and <span style='color:goldenrod'>*binary.com Interview Question I - Comparison of Univariate GARCH Models*</span>, we know that the **Fractionally Integrated GJR-GARCH** is the best-fitting model. In this paper I compare the MIDAS, GARCH-MIDAS and Levy process models, and I also test another high-frequency trading model, mcsGARCH. These papers can be viewed as an interday-trading comparison before starting high-frequency trading via [Real Time FXCM](https://github.com/scibrokes/real-time-fxcm).
<span style='color:goldenrod'>*High Frequency Financial Time Series Prediction - Machine Learning Approach*</span> introduces multilayer modelling for high-frequency trading. <span style='color:goldenrod'>*binary.com Interview Question I - Tick-Data-HiLo For Daily Trading*</span> <span style='color:red'>(Blooper)</span> tried to use the daily Hi-Lo dataset for modelling but failed; that paper recommends using the complete intraday dataset instead.
> I noticed that buying early in the morning, around 5am eastern time, tends to give lower prices and selling around 10pm eastern time gives you the highest prices. The wording is weird but i want to know your opinions on how time of day affects the bitcoin trading market. Thank you.
*Source : [Time of day affects trading prices](https://www.reddit.com/r/Bitcoin/comments/7hdtnw/time_of_day_affects_trading_prices/)*
From the quote above, we can see that price trends show seasonality at daily, weekly, monthly or annual horizons. MIDAS and mcsGARCH are models designed for high-frequency trading.
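As a preview of what an mcsGARCH specification looks like in `rugarch`, below is a minimal sketch; the object names `ret_min1` (1-minute returns) and `daily_var` (a daily-variance forecast aligned to the intraday index) are placeholders, not variables defined at this point of the paper.

```{r, eval=FALSE}
## Minimal mcsGARCH sketch with 'rugarch'; `ret_min1` and `daily_var` are placeholders.
library(rugarch)

spec_mcs <- ugarchspec(
  mean.model         = list(armaOrder = c(1, 1), include.mean = TRUE),
  variance.model     = list(model = 'mcsGARCH'),
  distribution.model = 'norm')

## mcsGARCH requires a forecast of the daily variance supplied via `DailyVar`,
## typically taken from a daily GARCH fit.
# fit_mcs <- ugarchfit(spec_mcs, data = ret_min1, DailyVar = daily_var)
# head(sigma(fit_mcs))
```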
*A Comparison of GARCH-class Models and MIDAS Regression with Application in Volatility Prediction and Value at Risk Estimation* compares three models, GARCH, eGARCH and MIDAS, under normal and Student-t distributions. The author concludes that the MIDAS model is the most accurate for volatility prediction, but the results are inconclusive for VaR at the 1% and 5% levels.
> Note that there does not seem to be an option to use SARMA models in the "rugarch" package, so you will have to let the "S" part go. But if there is a seasonal pattern (and that is quite likely when it comes to tourist arrivals), you will have to account for it somehow. Consider using exogenous seasonal variables (dummies or Fourier terms) in the conditional mean model via the argument external.regressors inside the argument mean.model in function ugarchspec. Alternatively, note that a SARMA model corresponds to a restricted ARMA model. An approximation of SARMA could thus be an ARMA with the appropriate lag order but without the SARMA-specific parameter restrictions (since those might not be available in "rugarch").
The quote above describes adding seasonal factors to the model, which is similar in spirit to the MIDAS approach; kindly refer to [Fitting ARIMA-GARCH model using "rugarch" package](https://stats.stackexchange.com/questions/176550/fitting-arima-garch-model-using-rugarch-package?answertab=votes#tab-top). A minimal sketch of that suggestion follows.
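The sketch below assumes an intraday return series `ret_min1` (a placeholder) with a 1440-minute seasonal period; Fourier terms approximate the seasonal (SARMA-like) part and enter the conditional mean through `external.regressors`.

```{r, eval=FALSE}
## Sketch: Fourier seasonal terms as external regressors in the conditional mean.
## `ret_min1` is a placeholder intraday return series.
library(rugarch)
library(forecast)

xreg <- fourier(ts(as.numeric(ret_min1), frequency = 1440), K = 2)

spec_sea <- ugarchspec(
  mean.model     = list(armaOrder = c(1, 1),
                        external.regressors = as.matrix(xreg)),
  variance.model = list(model = 'gjrGARCH', garchOrder = c(1, 1)))

# fit_sea <- ugarchfit(spec_sea, data = ret_min1)
```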
# Data
## Tick Data
### Get Data
The dataset gathered via `getSymbols('JPY=X', src='av', api.key=api, periodicity='intraday')` is tidy but contains only 100 observations, and I cannot select a period from a few years ago. Therefore I omit it and instead use the intraday data gathered from [`real-time-fxcm/data/USDJPY/`](https://github.com/scibrokes/real-time-fxcm/tree/master/data/USDJPY), from `Y2015W1` to `Y2018W27`. The dataset is tick-based, with more than 1 million observations per file (one file per week) and about 4 years of data in total. Since I need to backtest day by day, this will take a long time to run.
- [Chapter 7 Importing Financial Data from the Internet](https://msperlin.github.io/pafdR/importingInternet.html) introduces a few packages that provide API services.
- [High Frequency Data Price (Sample)](https://raw.githubusercontent.com/DavisVaughan/fin-econ-project-bitcoin/master/data/cleaned/cleaned-bitstamp-minutely.csv) is an example of an intraday high-frequency dataset.
```{r, warning = FALSE, message = FALSE}
cr_code <- c('AUDUSD=X', 'EURUSD=X', 'GBPUSD=X', 'CHF=X', 'CAD=X', 'CNY=X', 'JPY=X')
names(cr_code) <- c('AUDUSD', 'EURUSD', 'GBPUSD', 'USDCHF', 'USDCAD', 'USDCNY', 'USDJPY')
# names(cr_code) <- c('USDAUD', 'USDEUR', 'USDGBP', 'USDCHF', 'USDCAD', 'USDCNY', 'USDJPY')

## parent directory of the current working directory
dtr  <- str_extract(getwd(), '.*/')
dtr1 <- paste0(dtr, 'real-time-fxcm/data/')

## Read presaved FXCM data.
# mbase <- sapply(names(cr_code), function(x) readRDS(paste0('./data/', x, '.rds')) %>% na.omit)
fls <- sapply(names(cr_code), function(x) {
  list.files(paste0(dtr1, x), pattern = '^Y[0-9]{4}W[0-9]{1,2}.rds$') %>%
    str_replace_all('.rds', '')
})
## drop the currencies without any weekly files
fls <- fls[lengths(fls) > 0]

# AUDUSD <- sapply(fls[[1]], read_rds)
# EURUSD <- sapply(fls[[2]], read_rds)
# GBPUSD <- sapply(fls[[3]], read_rds)
# USDCHF <- sapply(fls[[4]], read_rds)
# USDCAD <- sapply(fls[[5]], read_rds)
# USDCNY <- sapply(fls[[6]], read_rds)
# mbase <- llply(as.list(fls[[7]]), read_rds) #185 files, each containing ~1 million observations.

## Here I take USDJPY as example...
dtr1s <- paste0(dtr1, names(fls[length(fls)]))
fs <- list.files(dtr1s, pattern = '^Y[0-9]{4}W[0-9]{1,2}.rds$') %>%
  str_replace_all('.rds', '')
# eval(parse(text = paste0(fs, "<- readRDS('", fls[[7]], "') %>% as_tibble")))

t.unit <- c('seconds', 'minutes', 'hours', 'days', 'weeks', 'months', 'quarters', 'quarters')

## https://www.alphavantage.co/
## https://www.alphavantage.co/support/#api-key
# api = 'UL7EPVVEGDVC3TXC'
# getSymbols('JPY=X', src='av', api.key=api, periodicity='intraday')
```
[binary.com Interview Question I - Multivariate GARCH Models](http://rpubs.com/englianhu/binary-Q1Multi-GARCH) concludes that multivariate models are more accurate, but to save time I only use univariate models for this comparison.
Due to the high volume of data, I only use `USDJPY`, since its variance is higher than that of the other currencies.
```{r, warning = FALSE, message = FALSE}
## Read raw dataset.
#Y2015W1 <- read_rds(paste0(dtr1, '/', fls[[7]][1], '.rds')) %>% as_tibble
eval(parse(text = paste0(fs[1], "<- readRDS('", dtr1, "/",
                         names(fls[length(fls)]), '/', fls[length(fls)][[1]][1],
                         ".rds') %>% as_tibble")))
## raw dataset
Y2015W1
```
The table above shows the raw tick dataset (price fluctuations at millisecond resolution). The variance at the `millisecond` unit is almost 0, therefore I follow <span style='color:goldenrod'>*High Frequency GARCH: The multiplicative component GARCH (mcsGARCH) model*</span> and use 1 minute as one time unit, converting the univariate `ask` and `bid` series into an OHLC dataset.
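The conversion itself is done by the `convertOHLC()` helper sourced from `./function/`; as a rough illustration of the same idea (not the helper itself), simulated tick mid-prices can be aggregated into 1-minute OHLC bars with `xts::to.period()`:

```{r, warning = FALSE, message = FALSE}
## Illustration only (not the convertOHLC() helper): aggregate simulated tick
## mid-prices into 1-minute OHLC bars.
library(xts)

set.seed(1)
tick_idx <- as.POSIXct('2015-01-05 00:00:00', tz = 'UTC') + sort(runif(300, 0, 300))
tick_mid <- xts(119 + cumsum(rnorm(300, 0, 0.001)), order.by = tick_idx)

ohlc_min1 <- to.period(tick_mid, period = 'minutes', k = 1, OHLC = TRUE)
head(ohlc_min1)
```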
### Tidy Data
![](www/seasonality.jpg)
> For example, the `taylor` data set from the `forecast` package contains half-hourly electricity demand data from England and Wales over about 3 months in 2000. It was defined as `taylor <- msts(x, seasonal.periods=c(48,336))`.
*Source [Seasonal periods](https://robjhyndman.com/hyndsight/seasonal-periods/)*
<span style='color:goldenrod'>*A Review of Literature on Time Zone Difference and Trade*</span> studies trading across time zones and countries, and whether the time-zone difference affects trading.
> I would like to use R for time series analysis. I want to make a time-series model and use functions from the packages timeDate and forecast.
I have intraday data in the CET time zone (15 minutes data, 4 data points per hour). On March 31st daylight savings time is implemented and I am missing 4 data points of the 96 that I usually have. On October 28th I have 4 data points too many as time is switched back.
For my time series model I always need 96 data points, as otherwise the intraday seasonality gets messed up.
Do you have any experiences with this? Do you know an R function or a package that would be of help to automat such data handling - something elegant? Thank you!
> I had a similar problem with hydrological data from a sensor. My timestamps were in UTC+1 (CET) and did not switch to daylight saving time (UTC+2, CEST). As I didn't want my data to be one hour off (which would be the case if UTC were used) I took the %z conversion specification of strptime. In ?strptime you'll find:
`%z` Signed offset in hours and minutes from UTC, so -0800 is 8 hours behind UTC.
For example: In 2012, the switch from Standard Time to DST occured on 2012-03-25, so there is no 02:00 on this day. If you try to convert "2012-03-25 02:00:00" to a POSIXct-Object,
```{r, warning = FALSE, message = FALSE}
## https://stackoverflow.com/questions/29202021/r-how-to-extract-dates-from-a-time-series
getTStime <- function(ats){
  start <- start(ats)
  end <- end(ats)
  time <- list()
  time[[1]] <- start
  m <- 2
  while(!(identical(start, end))){
    start[2] <- start[2] + 1
    if (start[2] == 1441){ #mins per day
      start[1] <- start[1] + 1
      start[2] <- 1
    }
    time[[m]] <- start
    m <- m + 1
  }
  return(time)
}

## https://stackoverflow.com/questions/13865172/handling-data-on-the-days-when-we-switch-to-daylight-savings-time-and-back-in-r
#> as.POSIXct("2012-03-25 02:00:00", tz="Europe/Vienna")
#[1] "2012-03-25 CET"
#
## you don't get an error or a warning, you just get date without the time (this behavior is documented).
## Using `format = "%z"` gives the desired result:
#
#> as.POSIXct("2012-03-25 02:00:00 +0100", format="%F %T %z", tz="Europe/Vienna")
#[1] "2012-03-25 03:00:00 CEST"

## function
as.POSIXct.no.dst <- function (
  x, tz = '', format = '%Y-%m-%d %H:%M', offset = '+0100', ...) {
  x <- paste(x, offset)
  format <- paste(format, '%z')
  res <- as.POSIXct(x, tz, format = format, ...)
  return(res)
}
```
*Source : [Handling data on the days when we switch to daylight savings time and back in R](https://stackoverflow.com/questions/13865172/handling-data-on-the-days-when-we-switch-to-daylight-savings-time-and-back-in-r)*
[Why is this xts frequency always 1?](https://stackoverflow.com/questions/34454947/why-is-this-xts-frequency-always-1) discusses the frequency of an `xts` dataset and why we need `zoo` to convert it.
> So far, we have considered relatively simple seasonal patterns such as quarterly and monthly data. However, higher frequency time series often exhibit more complicated seasonal patterns. For example, daily data may have a weekly pattern as well as an annual pattern. Hourly data usually has three types of seasonality: a daily pattern, a weekly pattern, and an annual pattern. Even weekly data can be challenging to forecast as it typically has an annual pattern with seasonal period of `365.25/7≈52.179` on average.
...
The top panel of Figure 11.1 shows the number of retail banking call arrivals per 5-minute interval between 7:00am and 9:05pm each weekday over a 33 week period. The bottom panel shows the first three weeks of the same time series. There is a strong daily seasonal pattern with frequency 169 (there are 169 5-minute intervals per day), and a weak weekly seasonal pattern with frequency `169 × 5 = 845`. (Call volumes on Mondays tend to be higher than the rest of the week.) If a longer series of data were available, we may also have observed an annual seasonal pattern.
*Source : [11.1 Complex seasonality](https://otexts.org/fpp2/complexseasonality.html)*
```{r, warning = FALSE, message = FALSE}
## Convert the univariate price to be OHLC price in `minutes` unit.
Y2015W1_min1 <- Y2015W1 %>%
convertOHLC(combine = TRUE, trade = FALSE, .unit = t.unit[2]) %>%
bind_rows #combined `ask/bid` price
Y2015W1_min1
```
```{r, warning = FALSE, message = FALSE}
#suppressWarnings(Y2015W1 <- tbl %>%
# dplyr::select(date, close) %>% tk_xts %>%
# auto.arima(seasonal = FALSE))
## Count the observation in order to model seasonal frequency model.
Y2015W1_min1 %>%
dplyr::select(index) %>%
ddply(.(date(index)), summarise, n = length(index)) %>%
as_tibble
```
Kindly refer to the section [Seasonal Data] or [Seasonal periods](https://robjhyndman.com/hyndsight/seasonal-periods/) for details on seasonal datasets. <span style='color:goldenrod'>*High Frequency GARCH: The multiplicative component GARCH (mcsGARCH) model*</span> uses a dataset that starts from `00:01:00` rather than `00:00:00`, therefore in the table above the last observation of a day falls at the start of the next day.
```{r, warning = FALSE, message = FALSE}
# tsz <- llply(fls[[7]], function(x) {
# y <- read_rds(x) %>%
# convertOHLC(combine = TRUE, trade = FALSE, .unit = t.unit[2]) %>%
# bind_rows %>%
# dplyr::filter(index == head(index, 1) |
# index == tail(index, 1)) %>%
# dplyr::mutate(diff = difftime(index, lag(index, 1), units = 'mins'))
# }) %>% bind_rows %>% as_tibble %>% arrange(index)
# saveRDS(tsz, 'C:/Users/scibr/Documents/GitHub/scibrokes/real-time-fxcm/data/USDJPY/tsz.rds')
## The daylight-saving conversion does not tally.
tsz <- read_rds(paste0(dtr1s, '/tsz.rds')) %>%
dplyr::filter(index >= ymd_hms('2015-01-05 00:00:00', tz = 'Europe/Athens'))
tsz
## count the frequency of weekly observation.
tsz %>% dplyr::count(diff) %>%
kable(caption = 'Count data point') %>%
kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
scroll_box(width = '100%', fixed_thead = TRUE, height = '400px')
## missing observation.
tsz %>% dplyr::filter(diff <= 6000)
```
```{r, warning = FALSE, message = FALSE}
## https://stackoverflow.com/questions/34454947/why-is-this-xts-frequency-always-1
## https://www.stat.berkeley.edu/~s133/dates.html
## https://stackoverflow.com/questions/45070078/convert-forecast-time-produced-by-arima-into-standard-date-time
#How to extract below index to datetime
#Y2015W1 %>% tk_ts(end = c(1440, 7200), frequency = 1440) %>% attributes %>% .$index %>% as.POSIXct(tz = 'UTC', origin = '1970-01-01') %>% force_tz(tz = 'Europe/Athens')
Y2015W1_min1 %>%
tk_ts(end = c(3, 7200), frequency = 1440) %>%
.[,1:ncol(.)] %>%
head #1440 * 5 = 7200
Y2015W1_min1 %>%
head %>%
zooreg(frequency = 1440)
#Y2015W1_min1 %>% tk_ts(end = c(1440, 7200), frequency = 1440) %>% index %>% as.POSIXct(tz = 'UTC', origin = '1970-01-01')
## https://stats.stackexchange.com/questions/144158/daily-time-series-analysis
## http://manishbarnwal.com/blog/2017/05/03/time_series_and_forecasting_using_R/
#How to extract below index to datetime
#Y2015W1_min1 %>% msts(seasonal.periods=c(1440, 7200)) %>% .[,1] %>% as.numeric %>% as.POSIXct(tz = 'UTC', origin = '1970-01-01') %>% force_tz(tz = 'Europe/Athens')
Y2015W1_min1 %>%
head %>%
msts(seasonal.periods=c(1440, 7200))
```
`ts()` can only build a year-month seasonal dataset; otherwise decimal numbers are needed to work out which date within a month an observation belongs to. `msts()` is more user friendly for modelling intra- and inter-period seasonal datasets; a small sketch follows. <s>Now I convert all datasets again from `UTC` to `UTC+2` to obtain a constant weekly seasonal dataset. Since there are 5 trading days and 2 rest days, I set a weekly seasonal period instead of a daily one.</s>
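For instance, a simulated minute-level series with both a daily (1440) and a weekly (1440 × 5 trading days) period can be declared as follows; the series below is synthetic, purely for illustration:

```{r, warning = FALSE, message = FALSE}
## Synthetic illustration of a multiple-seasonal minute series.
library(forecast)

set.seed(1)
x <- sin(2 * pi * (1:14400) / 1440) + rnorm(14400, 0, 0.1)  # ten "days" of minutes
x_msts <- msts(x, seasonal.periods = c(1440, 1440 * 5))
attr(x_msts, 'msts')  # the declared seasonal periods
```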
[2018 Time Zones - UTC](https://www.timeanddate.com/time/zone/timezone/utc) shows *No changes, UTC all of the period* from 2010 to 2019, i.e. UTC itself never shifts.
> From the [Wikipedia UTC page](http://en.wikipedia.org/wiki/Coordinated_Universal_Time):
> UTC does not change with a change of seasons, but local time or civil time may change if a time zone jurisdiction observes daylight saving time or summer time. For example, UTC is 5 hours ahead of local time on the east coast of the United States during winter, but 4 hours ahead during summer.
In other words, when a time zone observes DST, its offset from UTC changes when there's a DST transition, but that's that time zone observing DST, not UTC.
Without knowing much about PHP time zone handling, it seems strange to me that you can specify "with DST" or "without DST" in a conversion - the time zones themselves specify when DST kicks in... it shouldn't have to be something you specify yourself.
*Source : [Does UTC observe daylight saving time?](https://stackoverflow.com/questions/5495803/does-utc-observe-daylight-saving-time)*
<s>Since the `UTC` timezone has no daylight-saving issue, the initial trading time is a problem I need to cope with.</s> [Handling Data for Daylight-Saving and Non-Daylight-Saving for HFT](https://community.rstudio.com/t/handling-data-for-daylight-saving-and-non-daylight-saving-for-hft/15699/2) provides the solution for the timezone issue.
```{r, eval=FALSE}
## -------- eval=FALSE --------------
## Now I simply tidy all datasets and save them before starting the statistical modelling.
llply(fls[[7]], function(x) {
mbase <- read_rds(x) %>% as_tibble
## Convert the univariate price to be OHLC price in `minutes` unit.
mbase %<>% convertOHLC(.unit = t.unit[2], combine = TRUE) %>%
bind_rows #combined `ask/bid` price
y <- x %>% str_replace_all('.rds$', '_tick-to-min1.rds')
saveRDS(mbase, y)
cat(y, 'saved!\n')
})
```
## 1 Minute Data
### Get and Tidy Data
[fxcm/MarketData](https://github.com/fxcm/MarketData) was recently updated to provide 1-minute, 1-hour and 1-day datasets, so here I directly download the 1-minute dataset and tidy it.
> time is based on EST, because our server is in New Jersey USA.
it is 5:00PM at the end of the day, which is shown in GMT as 21:00 day light saving or 22:00 without day light saving.
*Source : [Inconsistency of trading date time #1](https://github.com/fxcm/MarketData/issues/1#issuecomment-427464344)*
From the quote above, we know that even the `EST` timestamps converted to `UTC` will not start at `00:00:00`. Therefore I follow [Handling data on the days when we switch to daylight savings time and back in R](https://stackoverflow.com/questions/13865172/handling-data-on-the-days-when-we-switch-to-daylight-savings-time-and-back-in-r) and convert to `UTC+2` (daylight saving `UTC+3`) to get the desired result. A minimal sketch of this conversion follows.
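The sketch below uses `lubridate`; the literal timestamp is only an example of an FXCM-style Eastern-time value, not taken from the data.

```{r, warning = FALSE, message = FALSE}
## Parse an FXCM-style Eastern-time timestamp, then view the same instant in other zones.
library(lubridate)

dt_est <- mdy_hms('01/05/2015 00:01:00', tz = 'America/New_York')
with_tz(dt_est, tzone = 'UTC')            # the same instant shown in UTC
with_tz(dt_est, tzone = 'Europe/Athens')  # shown in UTC+2 (UTC+3 during daylight saving)
```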
```{r, warning = FALSE, message = FALSE}
cr_code <- c('EURUSD=X', 'GBPUSD=X', 'CHF=X', 'CAD=X', 'JPY=X')
names(cr_code) <- c('EURUSD', 'GBPUSD', 'USDCHF', 'USDCAD', 'USDJPY')
fls1 <- llply(names(cr_code), function(x) {
fls <- list.files(paste0(dtr1, x), pattern = '^Y[0-9]{4}W[0-9]{1,2}_m1.rds$')
if (length(fls) > 0) paste0(dtr1, x, '/', fls)
})
names(fls1) <- names(cr_code)
dtr1s <- paste0(dtr1, names(fls1[length(fls1)]))
fs1 <- list.files(dtr1s, pattern = '^Y[0-9]{4}W[0-9]{1,2}_m1.rds$') %>%
str_replace_all('.rds', '')
## Read raw dataset.
# eval(parse(text = paste0(fs, "<- read_rds('", fls[[7]], "')")))
```
```{r, warning = FALSE, message = FALSE}
## Read raw dataset.
eval(parse(text = paste0(fs1[1], "<- read_rds('", fls1[[5]][1], "') %>% as_tibble")))
## raw dataset
Y2015W1_m1
```
Now I try to validate the daylight saving date.
```{r, warning = FALSE, message = FALSE}
# tsz2 <- llply(fls1[[5]], function(x) {
# y <- read_rds(x) %>% dplyr::filter(DateTime == head(DateTime, 1)|
# DateTime == tail(DateTime, 1)) %>%
# dplyr::mutate(DateTime = DateTime %>% mdy_hms %>%
# .POSIXct(tz = 'Europe/Athens'),
# diff = difftime(DateTime, lag(DateTime, 1), units = 'mins'))
#
# nch <- y$DateTime[1] %>% substr(nchar(.)+2, nchar(.)+3)
# y %<>% dplyr::mutate(
# nch = nch, DateTime = if_else(
# nch == '23', DateTime + hours(1), DateTime)) %>%
# dplyr::select(-nch)
# }) %>% bind_rows %>% as_tibble %>% arrange(DateTime)
# saveRDS(tsz2, 'C:/Users/scibr/Documents/GitHub/scibrokes/real-time-fxcm/data/USDJPY/tsz2.rds')
## The daylight-saving conversion does not tally.
tsz2 <- read_rds(paste0(dtr1s, '/tsz2.rds')) %>%
dplyr::rename(index = DateTime) %>%
dplyr::filter(index >= ymd_hms('2015-01-05 00:01:00', tz = 'Europe/Athens'))
tsz2
## count the frequency of weekly observation.
tsz2 %>% dplyr::count(diff) %>%
kable(caption = 'Count data point') %>%
kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
scroll_box(width = '100%', fixed_thead = TRUE, height = '400px')
## missing observation.
tsz2 %>% dplyr::filter(diff <= 6000)
```
From the tables above, we can see that the daylight-saving detection and the automatic `datetime` conversion do not tally. Here I unified all initial observations to start from `00:01:00`.
## Completion of Data
### Data Selection
[binary.com 面试试题 I - 单变量数据缺失值管理](http://rpubs.com/englianhu/handle-missing-value) and [binary.com面试试题 I - 单变量数据缺失值管理 II](http://rpubs.com/englianhu/handle-multivariate-missing-value) compare missing-value datasets and refill the missing values with `imputeTS::na.seadec()` using the following algorithms:
- interpolation
- kalman
- locf
- ma
- mean
- random
The papers conclude that a dataset repaired with `imputeTS::na.seadec(algorithm = 'interpolation')` or `imputeTS::na.seadec(algorithm = 'kalman')` is workable regardless of the proportion of missing values, since the MSE and bias are very low. `Amelia::amelia` is accurate, with small bias and imprecision compared to `imputeTS::na.seadec`, when the proportion of missing values is small. The papers compare tick-data-to-1-minute data as well as native 1-minute data, both gathered from FXCM. It is odd that `tidyr::fill` and `na.locf` are both not very accurate. Nevertheless, in this paper I use the `tidyr::fill()` method to fill the missing values; it behaves similarly to the Kalman-filter approach in that it first fills downward to the bottom of the dataset and then fills upward to repair any initial missing values at the top. It will not produce inconsistencies such as an open or close price higher than the highest price or lower than the lowest price, and the filled price will not deviate much from the trend as time goes by.
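For reference, a minimal sketch of the seasonal-decomposition imputation compared in those papers, applied to a built-in series (recent `imputeTS` releases name the function `na_seadec()`; older releases use `na.seadec()`):

```{r, warning = FALSE, message = FALSE}
## Knock out a few values of a seasonal series, then refill them by seasonal decomposition.
library(imputeTS)

x <- AirPassengers
x[c(20, 21, 60)] <- NA
x_filled <- na_seadec(x, algorithm = 'interpolation')  # or algorithm = 'kalman'
x_filled[c(20, 21, 60)]
```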
```{r, warning = FALSE, message = FALSE}
## tick-data to minute 1 dataset
#tsz <- read_rds('C:/Users/scibr/Documents/GitHub/scibrokes/real-time-fxcm/data/USDJPY/tsz.rds')
tsz %>% dplyr::count(diff) %>%
kable(caption = 'Count data point') %>%
kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
scroll_box(width = '100%', fixed_thead = TRUE, height = '400px')
## minute 1 dataset
#tsz2 <- read_rds('C:/Users/scibr/Documents/GitHub/scibrokes/real-time-fxcm/data/USDJPY/tsz2.rds')
tsz2 %>% dplyr::count(diff) %>%
kable(caption = 'Count data point') %>%
kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
scroll_box(width = '100%', fixed_thead = TRUE, height = '400px')
```
The tables above show that both datasets are incomplete. However, when I checked `Y2017W20` and `Y2017W20_m1`, the tick dataset only runs from `2017-05-15` to `2017-05-17`, while the m1 dataset runs from `2017-05-15` to `2017-05-19`. Kindly refer to the comparison below.
```{r, warning = FALSE, message = FALSE}
mean(tsz$diff, na.rm=TRUE)
mean(tsz2$diff, na.rm=TRUE)
```
From the comparison above, we know the `1min` dataset is more complete than the `tick-to-1min` dataset.
```{r, warning = FALSE, message = FALSE}
#left_join(tsz[c(1, ncol(tsz))], tsz2[c(1, ncol(tsz2))])
right_join(tsz[c(1, ncol(tsz))], tsz2[c(1, ncol(tsz2))])
```
- [How to fill in missing data in time series?](https://stats.stackexchange.com/questions/245615/how-to-fill-in-missing-data-in-time-series) discusses handling missing data.
- [4.1 Seasonal ARIMA models](https://onlinecourses.science.psu.edu/stat510/node/67/)
- [How to Handle Missing Data](https://towardsdatascience.com/how-to-handle-missing-data-8646b18db0d4)
Comparing the means above shows that the 1-minute dataset downloaded from FXCM is more complete than the tick dataset (converted to 1-minute) downloaded from FXCM.
```{r, eval=FALSE}
# -------- eval=FALSE -----------
## tick-data to min-1 dataset
data_tm1 <- llply(fls[[7]], function(x) {
y <- read_rds(x) %>%
convertOHLC(combine = TRUE)
yw <- x %>%
str_extract_all('Y[0-9]{4}W[0-9]{1,2}') %>%
str_split_fixed('[A-Z]{1}', 3) %>%
.[,-1]
y %<>% dplyr::mutate(
year = as.numeric(yw[1]), week = as.numeric(yw[2]), .)
}) %>%
bind_rows %>%
as_tibble %>%
arrange(index)
## min-1 dataset
data_m1 <- llply(fls1[[5]], function(x) {
y <- read_rds(x) %>%
dplyr::rename(index = DateTime) %>%
dplyr::mutate(index = index %>% mdy_hms %>%
.POSIXct(tz = 'Europe/Athens') %>%
force_tz())
yw <- x %>% str_extract_all('Y[0-9]{4}W[0-9]{1,2}') %>%
str_split_fixed('[A-Z]{1}', 3) %>% .[,-1]
nch <- y$index[1] %>% substr(nchar(.)+2, nchar(.)+3)
y %<>% dplyr::mutate(
year = as.numeric(yw[1]), week = as.numeric(yw[2]),
nch = nch, index = if_else(
nch == '23', index + hours(1), index)) %>%
dplyr::select(-nch)
}) %>%
bind_rows %>%
as_tibble %>%
arrange(index)
```
```{r, eval=FALSE}
# -------- eval=FALSE -----------
#dtm <- seq(ymd_hms('2015-01-01 00:00:00'), ymd_hms('2017-08-31 00:00:00'), by = 'minutes')
#dtm <- seq(from = ymd('2015-01-05'), to = ymd('2018-07-07'), by = 'weeks')
dtm <- data_tm1 %>%
dplyr::select(index, year ,week) %>%
dplyr::mutate(index = date(index)) %>%
ddply(.(year, week), head, 1) %>%
.[-nrow(.),]
## create a seq of datetime to complete the data point.
dttm <- llply(1:nrow(dtm), function(i) {
x1 <- dtm$index[i] %>%
as.POSIXct %>%
with_tz(tz = 'UTC') %>%
force_tz()
#x2 <- x1 + days(4) + hours(23) + minutes(59)
x2 <- x1 + days(5)
data_frame(index = seq.POSIXt(from = x1 + minutes(1), to = x2, by = '1 min'),
year = dtm[i,2], week = dtm[i,3])
}) %>%
bind_rows %>%
as_tibble
```
The chunk above created a sequence of `datetime` values.
```{r, eval=FALSE}
## merge dataset
data_m1 <- left_join(dttm, data_m1) %>%
as_tibble %>%
unique %>%
arrange(index)
data_tm1 <- left_join(dttm, data_tm1) %>%
as_tibble %>%
unique %>%
arrange(index)
## https://stackoverflow.com/questions/43212308/na-locf-using-group-by-from-dplyr
## https://stackoverflow.com/questions/40040834/replace-na-with-previous-or-next-value-by-group-using-dplyr/40041172
## https://stackoverflow.com/questions/47242643/na-at-the-end-of-column-using-na-locf-function
## https://stackoverflow.com/questions/49578085/na-locf-function-is-changing-data-frame-values-from-int-to-char-in-r
## https://stackoverflow.com/questions/13616965/how-to-fill-nas-with-locf-by-factors-in-data-frame-split-by-country
## https://stackoverflow.com/questions/23340150/replace-missing-values-na-with-most-recent-non-na-by-group
# data_m1 %>%
# group_by(index, week) %>%
# dplyr::mutate_all(funs(na.locf(., na.rm = FALSE)))
# data_m1 %>% split(data_m1$index) %>%
# llply(function(x) {
# na.locf(na.locf(x), fromLast = TRUE)
# }) %>% do.call(rbind, .)
# data_m1 %<>% ddply(.(index, week), na_locf) %>% as_tibble
#> data_m1 %>% anyNA
#[1] FALSE
data_m1 %<>%
ddply(.(year, week), function(x) {
x %>% fill(year, week, BidOpen, BidHigh, BidLow, BidClose,
AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
fill(year, week, BidOpen, BidHigh, BidLow, BidClose,
AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')
}) %>%
as_tibble
data_tm1 %<>%
ddply(.(year, week), function(x) {
x %>% fill(year, week, BidOpen, BidHigh, BidLow, BidClose,
AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
fill(year, week, BidOpen, BidHigh, BidLow, BidClose,
AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')
}) %>%
as_tibble
#> data_m1 %>% anyNA
#[1] TRUE
#> data_m1 %>% filter_all(any_vars(is.na(.)))
## A tibble: 7,200 x 11
# index year week BidOpen BidHigh BidLow BidClose AskOpen AskHigh AskLow AskClose
# <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2018-01-02 00:01:00 2017 53 NA NA NA NA NA NA NA NA
# 2 2018-01-02 00:02:00 2017 53 NA NA NA NA NA NA NA NA
# 3 2018-01-02 00:03:00 2017 53 NA NA NA NA NA NA NA NA
# 4 2018-01-02 00:04:00 2017 53 NA NA NA NA NA NA NA NA
# 5 2018-01-02 00:05:00 2017 53 NA NA NA NA NA NA NA NA
# 6 2018-01-02 00:06:00 2017 53 NA NA NA NA NA NA NA NA
# 7 2018-01-02 00:07:00 2017 53 NA NA NA NA NA NA NA NA
# 8 2018-01-02 00:08:00 2017 53 NA NA NA NA NA NA NA NA
# 9 2018-01-02 00:09:00 2017 53 NA NA NA NA NA NA NA NA
#10 2018-01-02 00:10:00 2017 53 NA NA NA NA NA NA NA NA
## ... with 7,190 more rows
#> data_tm1 %>% anyNA
#[1] FALSE
#> data_tm1 %>% filter_all(any_vars(is.na(.)))
## A tibble: 0 x 11
## ... with 11 variables: index <dttm>, year <dbl>, week <dbl>, AskOpen <dbl>, AskHigh <dbl>, AskLow <dbl>,
## AskClose <dbl>, BidOpen <dbl>, BidHigh <dbl>, BidLow <dbl>, BidClose <dbl>
saveRDS(data_m1, paste0(dtr1, '/data_m1.rds'))
saveRDS(data_tm1, paste0(dtr1, '/data_tm1.rds'))
```
I don't use `data.table` or `feather` because of storage concerns; a small write-speed sketch follows the links below. Kindly refer to [[问答] 对大数据如何用R高效处理](http://bbs.pinggu.org/forum.php?mod=redirect&goto=findpost&ptid=6685427&pid=53936462&fromuid=5794471) and [[问答] 对大数据如何用R高效处理](http://bbs.pinggu.org/forum.php?mod=redirect&goto=findpost&ptid=6685427&pid=53936861&fromuid=5794471).
- [christophsax/feather-fwrite.R](https://gist.github.com/christophsax/3db87a48596768b232a26dfce87c3299)
- [比較 save/load, saveRDS/read_rds, feather, 與 data.table 套件的讀寫速度](http://steve-chen.tw/?p=555)
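As a toy illustration of such a write-speed comparison (results depend heavily on the data and the disk, so this is only a sketch):

```{r, eval=FALSE}
## Toy write-speed comparison between saveRDS() and feather::write_feather().
library(microbenchmark)
library(feather)

df <- data.frame(x = rnorm(1e5), y = rnorm(1e5))
microbenchmark(
  rds     = saveRDS(df, tempfile(fileext = '.rds')),
  feather = write_feather(df, tempfile(fileext = '.feather')),
  times = 5)
```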
Finally, I filled the `NA` sections of `data_m1` and `data_tm1` using the `tidyr::fill` function.
### Read Data
Preloading all files before running the statistical modelling would occupy too much memory. Here I read the files directly and run the algorithmic predictions in the following sections.
```{r, warning = FALSE, message = FALSE}
rm(list = ls()[grep('i|j|tz|nch|yw|dtm|dttm|form|data|Y2015W|tsz|tsz2|dc', ls())])
cr_code <- c('AUDUSD=X', 'EURUSD=X', 'GBPUSD=X', 'CHF=X', 'CAD=X', 'CNY=X', 'JPY=X')
names(cr_code) <- c('AUDUSD', 'EURUSD', 'GBPUSD', 'USDCHF', 'USDCAD', 'USDCNY', 'USDJPY')
data_m1 <- read_rds(paste0(dtr1s, '/data_m1.rds'))
tb1 <- data_m1 %>% ddply(.(year, week), head, 1) %>%
kbl('html', caption = 'Weekly Initial Data Point', escape = FALSE) %>%
kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
scroll_box(width = '100%', fixed_thead = TRUE, height = '400px')
tb1
```
```{r, warning = FALSE, message = FALSE}
tb2 <- data_m1 %>% ddply(.(year, week), tail, 1) %>%
kbl('html', caption = 'Weekly Final Data Point', escape = FALSE) %>%
kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
scroll_box(width = '100%', fixed_thead = TRUE, height = '400px')
tb2
```
```{r, warning = FALSE, message = FALSE}
## Tidy dataset for modelling.
data_m1 %<>%
dplyr::mutate(open = (BidOpen + AskOpen)/2, close = (BidClose + AskClose)/2) %>%
dplyr::rename(high = BidHigh, low = AskLow) %>% #use bid price for sell.
dplyr::select(index, open, high, low, close) # and ask price for buy.
```
Here I check whether the filled dataset is biased. Since above I used `open = (BidOpen + AskOpen)/2`, `high = BidHigh`, `low = AskLow` and `close = (BidClose + AskClose)/2`^[For buying orders we refer to the `ask` price, and for selling orders the `bid` price.], there could be bias.
```{r, warning = FALSE, message = FALSE}
tb3 <- data_m1 %>% dplyr::mutate(
bias.open = if_else(open>high|open<low, 1, 0),
bias.high = if_else(high<open|high<low|high<close, 1, 0),
bias.low = if_else(low>open|low>high|low>close, 1, 0),
bias.close = if_else(close>high|close<low, 1, 0)) %>%
dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)# %>%
# kable(caption = 'Bias Imputation') %>%
# kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
# scroll_box(width = '100%', fixed_thead = TRUE, height = '400px')
tb3
```
```{r include = FALSE}
rm(tb3)
```
```{r, warning = FALSE, eval=FALSE}
# -------- eval=FALSE -----------
## Below proves that the dataset has no bias or error.
> read_rds(paste0(dtr1, '/data_m1.rds')) %>% dplyr::mutate(
+ open = BidOpen, high = BidHigh, low = BidLow, close = BidClose,
+ bias.open = if_else(open>high|open<low, 1, 0),
+ bias.high = if_else(high<open|high<low|high<close, 1, 0),
+ bias.low = if_else(low>open|low>high|low>close, 1, 0),
+ bias.close = if_else(close>high|close<low, 1, 0)) %>%
+ dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
# A tibble: 0 x 19
# ... with 19 variables: index <dttm>, year <dbl>, week <dbl>, BidOpen <dbl>, BidHigh <dbl>, BidLow <dbl>,
# BidClose <dbl>, AskOpen <dbl>, AskHigh <dbl>, AskLow <dbl>, AskClose <dbl>, open <dbl>, high <dbl>, low <dbl>,
# close <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
> read_rds(paste0(dtr, '/data_tm1.rds')) %>% dplyr::mutate(
+ open = BidOpen, high = BidHigh, low = BidLow, close = BidClose,
+ bias.open = if_else(open>high|open<low, 1, 0),
+ bias.high = if_else(high<open|high<low|high<close, 1, 0),
+ bias.low = if_else(low>open|low>high|low>close, 1, 0),
+ bias.close = if_else(close>high|close<low, 1, 0)) %>%
+ dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
# A tibble: 0 x 19
# ... with 19 variables: index <dttm>, year <dbl>, week <dbl>, AskOpen <dbl>, AskHigh <dbl>, AskLow <dbl>,
# AskClose <dbl>, BidOpen <dbl>, BidHigh <dbl>, BidLow <dbl>, BidClose <dbl>, open <dbl>, high <dbl>, low <dbl>,
# close <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
> read_rds(paste0(dtr1, '/data_m1.rds')) %>% dplyr::mutate(
+ open = AskOpen, high = AskHigh, low = AskLow, close = AskClose,
+ bias.open = if_else(open>high|open<low, 1, 0),
+ bias.high = if_else(high<open|high<low|high<close, 1, 0),
+ bias.low = if_else(low>open|low>high|low>close, 1, 0),
+ bias.close = if_else(close>high|close<low, 1, 0)) %>%
+ dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
# A tibble: 0 x 19
# ... with 19 variables: index <dttm>, year <dbl>, week <dbl>, BidOpen <dbl>, BidHigh <dbl>, BidLow <dbl>,
# BidClose <dbl>, AskOpen <dbl>, AskHigh <dbl>, AskLow <dbl>, AskClose <dbl>, open <dbl>, high <dbl>, low <dbl>,
# close <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
> read_rds(paste0(dtr1, '/data_tm1.rds')) %>% dplyr::mutate(
+ open = AskOpen, high = AskHigh, low = AskLow, close = AskClose,
+ bias.open = if_else(open>high|open<low, 1, 0),
+ bias.high = if_else(high<open|high<low|high<close, 1, 0),
+ bias.low = if_else(low>open|low>high|low>close, 1, 0),
+ bias.close = if_else(close>high|close<low, 1, 0)) %>%
+ dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
# A tibble: 0 x 19
# ... with 19 variables: index <dttm>, year <dbl>, week <dbl>, AskOpen <dbl>, AskHigh <dbl>, AskLow <dbl>,
# AskClose <dbl>, BidOpen <dbl>, BidHigh <dbl>, BidLow <dbl>, BidClose <dbl>, open <dbl>, high <dbl>, low <dbl>,
# close <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
```
I initially tried to use the `bid` price for the high and the `ask` price for the low in order to produce better buy and sell prediction prices. However, for this paper I use the mean of bid and ask for all OHLC prices to avoid statistical error/bias.
Since the `1min` dataset is more complete than the `tickdata-to-1min` dataset, I use the `1min` dataset here.
```{r, warning = FALSE, message = FALSE}
if(!exists('data_m1')) {
data_m1 <- read_rds(paste0(dtr1s, '/data_m1.rds'))
}
if(names(data_m1) %>% str_detect('Bid|Ask') %>% any()) {
data_m1 %<>%
dplyr::mutate(open = (BidOpen + AskOpen)/2,
high = (BidHigh + AskHigh)/2,
low = (BidLow + AskLow)/2,
close = (BidClose + AskClose)/2) %>%
dplyr::select(index, open, high, low, close)
}
```
```{r, warning = FALSE, message = FALSE}
tb4 <- data_m1 %>% dplyr::mutate(
bias.open = if_else(open>high|open<low, 1, 0),
bias.high = if_else(high<open|high<low|high<close, 1, 0),
bias.low = if_else(low>open|low>high|low>close, 1, 0),
bias.close = if_else(close>high|close<low, 1, 0)) %>%
dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)# %>%
# kable(caption = 'Bias Imputation') %>%
# kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
# scroll_box(width = '100%', fixed_thead = TRUE, height = '400px')
tb4
```
```{r include = FALSE}
rm(tb4)
```
```{js}
//<script type="text/javascript" src="//cdn.datacamp.com/dcl-react.js.gz"></script>
```
*Source : [DataCamp Light](https://github.com/datacamp/datacamp-light)*
Feel free to try the [Online Coding Platform Example](https://cdn.datacamp.com/dcl-react-prod/example.html) for R, Python and Shell.
# Modelling
## Seasonal Modelling
The articles below introduce **TBATS models** (`tbats()`) and **dynamic harmonic regression with multiple seasonal periods** (`auto.arima()` with Fourier regressors); a minimal sketch follows the list. Here I also include `ts()`, **MIDAS** (the `midasr` package), **GARCH-MIDAS**, **mcsGARCH** and the **Levy process** in this research.
- [Forecasting: Principles and Practice - *11.1 Complex seasonality*](https://otexts.com/fpp2/complexseasonality.html#complexseasonality)
- [Forecasting: Principles and Practice - *12.1 Weekly, daily and sub-daily data*](https://otexts.com/fpp2/weekly.html#weekly)
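A minimal sketch of those two approaches on the half-hourly `taylor` demand series shipped with the `forecast` package (subsetted to four weeks to keep the run time reasonable; `tbats()` is still slow, hence `eval=FALSE`):

```{r, eval=FALSE}
## Four weeks of half-hourly demand, declared with both seasonal periods (48/day, 336/week).
library(forecast)

y <- msts(taylor[1:(48 * 28)], seasonal.periods = c(48, 336))

## TBATS handles the multiple seasonal periods directly (can take a while to fit).
fit_tbats <- tbats(y)

## Dynamic harmonic regression: Fourier terms as regressors in a non-seasonal ARIMA.
fit_dhr <- auto.arima(y, seasonal = FALSE, xreg = fourier(y, K = c(5, 5)))
fc_dhr  <- forecast(fit_dhr, xreg = fourier(y, K = c(5, 5), h = 48))
```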
<span style='color:red'>**Progress Function**</span>
```{r}
task_progress <- function(mbase, timeID0 = NULL, scs = 60, .pattern = '^mts|^sets', .loops = TRUE) {
## ------------- Scheduled progress check ----------------------
## Every minute, automatically query and refresh the progress (number of saved files)
## of the simulated forecasts above.
require('magrittr')
require('tibble')
if(!is.data.frame(mbase)) {
mbase %<>% data.frame
}
if (.loops == TRUE) {
while(1) {
cat('Current Tokyo Time :', as.character(now('Asia/Tokyo')), '\n\n')
y = as_date(mbase$index) %>%
unique
y <- y[weekdays(y) != 'Saturday'] # filter out the weekly last price, which is 12:00am on Saturday
datee = y
if(is.null(timeID0)) {
timeID0 = y[1]
} else if (is.Date(timeID0)) {
timeID0 = as_date(timeID0)
} else {
timeID0 = as_date(mbase$index) %>%
unique
}
y = y[y >= timeID0]
x = list.files(paste0('./data/fx/USDJPY/'), pattern = .pattern) %>%
str_replace_all('.rds', '') %>%
str_replace_all('.201', '_201') %>%
str_split_fixed('_', '2') %>%
as_tibble %>%
dplyr::rename('Model' = 'V1', 'Date' = 'V2') %>%
dplyr::mutate(Model = factor(Model), Date = as_date(Date))
x = join(tibble(Date = datee), x) %>%
as_tibble
x %<>% na.omit
x %<>% dplyr::mutate(binary = if_else(is.na(Model), 0, 1)) %>%
spread(Model, binary)
z <- ldply(x[,-1], function(zz) {
na.omit(zz) %>% length }) %>%
dplyr::rename(x = V1) %>%
dplyr::mutate(n = length(y), progress = percent(x/n))
print(z)
prg = sum(z$x)/sum(z$n)
cat('\n================', as.character(percent(prg)), '================\n\n')
if (prg == 1) break # stop refreshing once progress reaches 100%.
Sys.sleep(scs) # the ldply() above takes about 3~5 seconds; rest `scs` (default 60) seconds between checks.
}
} else {
cat('Current Tokyo Time :', as.character(now('Asia/Tokyo')), '\n\n')
y = as_date(mbase$index) %>%
unique
datee = y
if(is.null(timeID0)) {
timeID0 = y[1]
} else if (is.Date(timeID0)) {
timeID0 = as_date(timeID0)
} else {
timeID0 = as_date(mbase$index) %>%
unique
}
y = y[y >= timeID0]
x = list.files(paste0('./data/fx/USDJPY/'), pattern = .pattern) %>%
str_replace_all('.rds', '') %>%
str_replace_all('.201', '_201') %>%
str_split_fixed('_', '2') %>%
as_tibble %>%
dplyr::rename('Model' = 'V1', 'Date' = 'V2') %>%
dplyr::mutate(Model = factor(Model), Date = as_date(Date))
x = join(tibble(Date = datee), x) %>%
as_tibble
x %<>% na.omit
x %<>% dplyr::mutate(binary = if_else(is.na(Model), 0, 1)) %>%
spread(Model, binary)
z <- ldply(x[,-1], function(zz) {
na.omit(zz) %>% length }) %>%
dplyr::rename(x = V1) %>%
dplyr::mutate(n = length(y), progress = percent(x/n))
print(z)
prg = sum(z$x)/sum(z$n)
cat('\n================', as.character(percent(prg)), '================\n\n')
}
}
```
## Seasonal ETS
### ETS `ts()`
`forecast.ets()` automatically uses the optimal `ets()` specification, similar in spirit to `auto.arima()`; a small illustration follows.
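The example below uses a built-in monthly series rather than the FX data, purely to show the automatic model selection:

```{r, warning = FALSE, message = FALSE}
## ets() picks the error/trend/season form automatically; forecast() reuses it.
library(forecast)

fit_ets <- ets(USAccDeaths)
summary(fit_ets)
fc_ets <- forecast(fit_ets, h = 12)
autoplot(fc_ets)
```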
### Weekly >> Daily
I set the length of the training dataset to one week, with the frequency set to 1440 minutes (one day).
```{r, eval=FALSE}
# --------- eval=FALSE ---------
#sq <- seq(1 , length(data_m1$index), by = 1440)
#sets <- list()
timeID <- data_m1$index %>%
as_date %>%
unique %>%
sort
timeID %<>% .[. > as_date('2015-01-11')]
for (dt in timeID) {
smp <- data_m1 %>%
tk_xts(silent = TRUE)
dt %<>% as_date
smp <- smp[paste0(dt %m-% weeks(1) + seconds(59), '/', dt + seconds(59))]
sets <- smp %>%
tk_ts(frequency = 1440) %>%
forecast(h = 1440) %>%
llply(tk_tbl)
if(is.double(sets$forecast$index[1])){
sq <- smp %>%
tail(1) %>%
index
if(weekdays(sq) == '土曜日'|weekdays(sq) == 'Saturday') sq <- sq + days(2)
sq <- seq(from = sq + minutes(1), sq + days(1), by = 'min')
sets$forecast$index <- sq
} else {
sets$forecast$index <- data_m1$index[
(which(data_m1$index == smp %>%
index %>%
xts::last()) + 1):(
which(data_m1$index == smp %>%
index %>%
xts::last()) + 1440)]
}
if (!dir.exists(paste0('data/fx/USDJPY')))
dir.create(paste0('data/fx/USDJPY'))
saveRDS(sets, paste0('data/fx/USDJPY/sets.wk.1440.',
as_date(sets$forecast$index[1]), '.rds'))
cat(paste0(
'data/fx/USDJPY/sets.wk.1440.',
as_date(sets$forecast$index[1]), '.rds saved!\n'))
}
```
### Monthly >> Daily
I set the length of the training dataset to one month, with the frequency set to 1440 minutes (one day). The initial forecasts are based on a weekly dataset, which then accumulates day by day until it reaches a monthly dataset.
```{r, eval=FALSE}
# --------- eval=FALSE ---------
#sq <- seq(1 , length(data_m1$index), by = 1440)
#sets <- list()
timeID <- data_m1$index %>%
as_date %>%
unique %>%
sort
timeID %<>% .[. > as_date('2015-01-11')]
for (dt in timeID) {
smp <- data_m1 %>%
tk_xts(silent = TRUE)
dt %<>% as_date
smp <- smp[paste0(dt %m-% months(1) + seconds(59), '/', dt + seconds(59))]
sets <- smp %>%
tk_ts(frequency = 1440) %>%
forecast(h=1440) %>%
llply(tk_tbl)
if(is.double(sets$forecast$index[1])){
sq <- smp %>%
tail(1) %>%
index
if(weekdays(sq) == '土曜日'|weekdays(sq) == 'Saturday') sq <- sq + days(2)
sq <- seq(from = sq + minutes(1), sq + days(1), by = 'min')
sets$forecast$index <- sq
} else {
sets$forecast$index <- data_m1$index[
(which(data_m1$index == smp %>%