Commit: add torch_model

Zilin Chen authored and Zilin Chen committed Feb 7, 2024
1 parent f13c34b commit 3e76c6d
Showing 23 changed files with 3,324 additions and 4 deletions.
2,131 changes: 2,131 additions & 0 deletions Example_torch_model.ipynb

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions config.py
@@ -0,0 +1,14 @@
from __future__ import annotations


# directories
DATA_SAVE_DIR = "data"
TRAINED_MODEL_DIR = "trained_models"

# date format: %Y-%m-%d
START_DATE = "2020-01-06"
END_DATE = "2020-01-07"

# trades/quotes subcolumns
TRADE_COLUMNS = ['Time', 'Symbol', 'Date', 'Participant_Timestamp', 'Trade_Volume', 'Trade_Price', 'Trade_Reporting_Facility']
QUOTE_COLUMNS = ['Time', 'Symbol', 'Date', 'Participant_Timestamp', 'Bid_Price', 'Bid_Size', 'Offer_Price', 'Offer_Size']
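For context, a minimal sketch of how these constants might be consumed downstream, assuming the gzipped CSV naming scheme used in data_preprocessing/get_data.py; the exact filename here is illustrative, not part of the commit:

import pandas as pd

import config

# hypothetical raw file path, following the datapath + '/raw/trades_{symbol}_...' scheme in get_data.py
path = f"{config.DATA_SAVE_DIR}/raw/trades_AAPL_20200106-20200107.csv.gz"
trades = pd.read_csv(path, usecols=config.TRADE_COLUMNS)  # compression inferred from .gz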
107 changes: 107 additions & 0 deletions config_tickers.py
@@ -0,0 +1,107 @@
from __future__ import annotations

SINGLE_TICKER = ["AAPL"]


NAS_100_TICKER = [
"AMGN",
"AAPL",
"AMAT",
"INTC",
"PCAR",
"PAYX",
"MSFT",
"ADBE",
"CSCO",
"XLNX",
"QCOM",
"COST",
"SBUX",
"FISV",
"CTXS",
"INTU",
"AMZN",
"EBAY",
"BIIB",
"CHKP",
"GILD",
"NLOK",
"CMCSA",
"FAST",
"ADSK",
"CTSH",
"NVDA",
"GOOGL",
"ISRG",
"VRTX",
"HSIC",
"BIDU",
"ATVI",
"ADP",
"ROST",
"ORLY",
"CERN",
"BKNG",
"MYL",
"MU",
"DLTR",
"ALXN",
"SIRI",
"MNST",
"AVGO",
"TXN",
"MDLZ",
"FB",
"ADI",
"WDC",
"REGN",
"LBTYK",
"VRSK",
"NFLX",
"TSLA",
"CHTR",
"MAR",
"ILMN",
"LRCX",
"EA",
"AAL",
"WBA",
"KHC",
"BMRN",
"JD",
"SWKS",
"INCY",
"PYPL",
"CDW",
"FOXA",
"MXIM",
"TMUS",
"EXPE",
"TCOM",
"ULTA",
"CSX",
"NTES",
"MCHP",
"CTAS",
"KLAC",
"HAS",
"JBHT",
"IDXX",
"WYNN",
"MELI",
"ALGN",
"CDNS",
"WDAY",
"SNPS",
"ASML",
"TTWO",
"PEP",
"NXPI",
"XEL",
"AMD",
"NTAP",
"VRSN",
"LULU",
"WLTW",
"UAL",
]
Empty file added data_preprocessing/__init__.py
Empty file.
10 changes: 6 additions & 4 deletions data_preprocessing/get_data.py
@@ -1,11 +1,12 @@
import os
import pandas as pd

import paramiko
from dotenv import load_dotenv
from scp import SCPClient


-def get_trades(symbols, start_date, end_date, row_limit,columns):
+def get_trades(datapath, symbols, start_date, end_date, row_limit,columns):
# load the contents of the .env file into the environment
load_dotenv()

@@ -43,10 +44,11 @@ def get_trades(symbols, start_date, end_date, row_limit,columns):

# fetch the remote file 'trade_results.csv' from the directory 'TAQNYSE-Clickhouse'
# and save it to the data directory in the pipelines folder
-local_file_path = f'data/trades_{symbol}_{start_date.replace("-", "")}-{end_date.replace("-", "")}.csv.gz'
+local_file_path = datapath + f'/raw/trades_{symbol}_{start_date.replace("-", "")}-{end_date.replace("-", "")}.csv.gz'
os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

scp.get(f"trade_results.csv.gz", local_file_path)


except Exception as e:
print(f"An error occurred: {e}")
@@ -58,7 +60,7 @@ def get_trades(symbols, start_date, end_date, row_limit,columns):
ssh.close()


-def get_quotes(symbols, start_date, end_date, row_limit,columns):
+def get_quotes(datapath, symbols, start_date, end_date, row_limit,columns):
# load the contents of the .env file into the environment
load_dotenv()

@@ -96,7 +98,7 @@ def get_quotes(symbols, start_date, end_date, row_limit,columns):

# fetch the remote file 'trade_results.csv' from the directory 'TAQNYSE-Clickhouse'
# and save it to the data directory in the pipelines folder
-local_file_path = f'data/quotes_{symbol}_{start_date.replace("-", "")}-{end_date.replace("-", "")}.csv.gz'
+local_file_path = datapath + f'/raw/quotes_{symbol}_{start_date.replace("-", "")}-{end_date.replace("-", "")}.csv.gz'
os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
scp.get(f"quote_results.csv.gz", local_file_path)

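An illustrative call sketch for the updated signatures, assuming a .env file with the SSH credentials that load_dotenv() picks up; the row limit is an arbitrary example value:

from config import DATA_SAVE_DIR, START_DATE, END_DATE, TRADE_COLUMNS, QUOTE_COLUMNS
from config_tickers import SINGLE_TICKER
from data_preprocessing.get_data import get_trades, get_quotes

# files now land under DATA_SAVE_DIR/raw/ via the new datapath argument
get_trades(DATA_SAVE_DIR, SINGLE_TICKER, START_DATE, END_DATE, 1_000_000, TRADE_COLUMNS)
get_quotes(DATA_SAVE_DIR, SINGLE_TICKER, START_DATE, END_DATE, 1_000_000, QUOTE_COLUMNS)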
Empty file added torch_models/__init__.py
Empty file.
Empty file.
11 changes: 11 additions & 0 deletions torch_models/evaluations/metrics.py
@@ -0,0 +1,11 @@
import numpy as np


def out_of_sample_R2(y_true, y_predict):
    """Out-of-sample R^2: 1 - SS_res / SS_tot over the evaluation window."""
    # residual sum of squares
    SS_res = np.sum((y_true - y_predict) ** 2)

    # total sum of squares around the mean of the true values
    mean_y_true = np.mean(y_true)
    SS_tot = np.sum((y_true - mean_y_true) ** 2)

    return 1 - (SS_res / SS_tot)
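A quick sanity check with made-up numbers; note the metric goes negative whenever the predictions do worse than the in-window mean of y_true:

import numpy as np

from torch_models.evaluations.metrics import out_of_sample_R2

y_true = np.array([0.010, -0.020, 0.005, 0.015])
y_pred = np.array([0.008, -0.015, 0.000, 0.020])
print(out_of_sample_R2(y_true, y_pred))  # ~0.89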
11 changes: 11 additions & 0 deletions torch_models/metadata/__init__.py
@@ -0,0 +1,11 @@
from typing import runtime_checkable, Protocol


@runtime_checkable
class Dataset(Protocol):
    """
    Abstract class for a dataset used for model training
    """

    def __len__(self) -> int:
        raise NotImplementedError
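Because the protocol is @runtime_checkable, any class exposing __len__ passes an isinstance check without inheriting from Dataset. A hypothetical example (TaqDataset is not part of this commit):

from torch_models.metadata import Dataset

class TaqDataset:
    def __init__(self, samples):
        self.samples = samples

    def __len__(self) -> int:
        return len(self.samples)

assert isinstance(TaqDataset([1, 2, 3]), Dataset)  # structural check, no inheritance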
Empty file.
175 changes: 175 additions & 0 deletions torch_models/metadata/data_processors/features_taq.py
@@ -0,0 +1,175 @@
from __future__ import annotations

import pandas as pd
import numpy as np
from typing import Optional


class CalendarFeatureGenerator:
    """
    Generate calendar-mode features for TAQ data
    """

    def __init__(self):
        self.feature_map = {}

    def reset(self):
        self.feature_map = {}
        return

    def features_to_df(self, df: pd.DataFrame) -> pd.DataFrame:
        feature_df = pd.DataFrame(self.feature_map)
        feature_df.index = pd.DatetimeIndex(df['Participant_Timestamp'])
        return feature_df

    def getForwardReturn(self, df: pd.DataFrame, prediction_length: float):
        if 'Return' in self.feature_map:
            return self.feature_map['Return']
        else:
            # mean trade price over the next `prediction_length` seconds,
            # relative to the current mid price (assumes a default RangeIndex)
            pts = pd.to_datetime(df['Participant_Timestamp'].values)
            forward_idx = np.searchsorted(pts, pts + pd.Timedelta(seconds=prediction_length), side='right')
            self.feature_map['Return'] = pd.Series([df.iloc[start:end]['Trade_Price'].mean() / df.iloc[start]['Mid_Price'] - 1
                                                    for start, end in zip(df.index, forward_idx)])
            return self.feature_map['Return']

    def calc_time_deltas(self, df: pd.DataFrame, delta1: float, delta2: float):
        """
        Calculate start/end indices for the lookback interval (t - delta2, t - delta1]
        """
        self.delta1, self.delta2 = delta1, delta2
        pts = pd.to_datetime(df['Participant_Timestamp'].values)
        self.start_idx = np.searchsorted(pts, pts - pd.Timedelta(seconds=delta2), side='right')
        self.end_idx = np.searchsorted(pts, pts - pd.Timedelta(seconds=delta1), side='right') - 1

        return


    # Breath (number of trades in the lookback window)
    def getBreath(self, df: pd.DataFrame) -> pd.Series:
        if 'Breath' in self.feature_map:
            return self.feature_map['Breath']
        else:
            self.feature_map['Breath'] = pd.Series([df.iloc[start:end+1]['Type'].eq('T').sum()
                                                    for start, end in zip(self.start_idx, self.end_idx)])
            return self.feature_map['Breath']

    # Immediacy
    def getImmediacy(self, df: pd.DataFrame) -> pd.Series:
        if 'Immediacy' in self.feature_map:
            return self.feature_map['Immediacy']
        else:
            breath = self.getBreath(df)
            delta_diff = self.delta2 - self.delta1
            self.feature_map['Immediacy'] = np.where(breath == 0, np.nan, delta_diff / breath)
            return self.feature_map['Immediacy']

    # VolumeAll
    def getVolumeAll(self, df: pd.DataFrame) -> pd.Series:
        if 'VolumeAll' in self.feature_map:
            return self.feature_map['VolumeAll']
        else:
            self.feature_map['VolumeAll'] = pd.Series([df.iloc[start:end+1]['Trade_Volume'].sum()
                                                       for start, end in zip(self.start_idx, self.end_idx)])
            return self.feature_map['VolumeAll']

    # VolumeAvg
    def getVolumeAvg(self, df: pd.DataFrame) -> pd.Series:
        if 'VolumeAvg' in self.feature_map:
            return self.feature_map['VolumeAvg']
        else:
            volumeAll = self.getVolumeAll(df)
            breath = self.getBreath(df)
            # guard against empty windows (breath == 0), as in getImmediacy
            self.feature_map['VolumeAvg'] = np.where(breath != 0, volumeAll / breath, np.nan)
            return self.feature_map['VolumeAvg']

    # VolumeMax
    def getVolumeMax(self, df: pd.DataFrame) -> pd.Series:
        if 'VolumeMax' in self.feature_map:
            return self.feature_map['VolumeMax']
        else:
            self.feature_map['VolumeMax'] = pd.Series([max(df.iloc[start:end+1]['Trade_Volume'], default=0)
                                                       for start, end in zip(self.start_idx, self.end_idx)])
            return self.feature_map['VolumeMax']

    # Lambda (mid-price change per unit volume)
    def getLambda(self, df: pd.DataFrame) -> pd.Series:
        if 'Lambda' in self.feature_map:
            return self.feature_map['Lambda']
        else:
            volumeAll = self.getVolumeAll(df)
            p_change = pd.Series([df.iloc[end]['Mid_Price'] - df.iloc[start]['Mid_Price']
                                  for start, end in zip(self.start_idx, self.end_idx)])

            self.feature_map['Lambda'] = np.where(volumeAll != 0, p_change / volumeAll, np.nan)
            return self.feature_map['Lambda']

    # LobImbalance
    def getLobImbalance(self, df: pd.DataFrame) -> pd.Series:
        if 'LobImbalance' in self.feature_map:
            return self.feature_map['LobImbalance']
        else:
            temp_df = df.copy()
            temp_df['Imbalance'] = (temp_df['Offer_Size'] - temp_df['Bid_Size']) / (temp_df['Offer_Size'] + temp_df['Bid_Size'])
            self.feature_map['LobImbalance'] = pd.Series([temp_df.iloc[start:end+1]['Imbalance'].mean()
                                                          for start, end in zip(self.start_idx, self.end_idx)])
            return self.feature_map['LobImbalance']

    # TxnImbalance
    def getTxnImbalance(self, df: pd.DataFrame) -> pd.Series:
        if 'TxnImbalance' in self.feature_map:
            return self.feature_map['TxnImbalance']
        else:
            volumeAll = self.getVolumeAll(df)
            temp_df = df.copy()
            temp_df['Vt_Dir'] = temp_df['Trade_Volume'] * temp_df['Trade_Side']
            sum_Vt_Dir = pd.Series([temp_df.iloc[start:end+1]['Vt_Dir'].sum() for start, end in zip(self.start_idx, self.end_idx)])
            self.feature_map['TxnImbalance'] = np.where(volumeAll != 0, sum_Vt_Dir / volumeAll, np.nan)
            return self.feature_map['TxnImbalance']

    # PastReturn
    def getPastReturn(self, df: pd.DataFrame) -> pd.Series:
        if 'PastReturn' in self.feature_map:
            return self.feature_map['PastReturn']
        else:
            p_return = pd.Series([df.iloc[start:end+1]['Trade_Price'].mean() / df.iloc[end]['Mid_Price']
                                  for start, end in zip(self.start_idx, self.end_idx)])

            self.feature_map['PastReturn'] = 1 - p_return
            return self.feature_map['PastReturn']

    # QuotedSpread
    def getQuotedSpread(self, df: pd.DataFrame) -> pd.Series:
        if 'QuotedSpread' in self.feature_map:
            return self.feature_map['QuotedSpread']
        else:
            temp_df = df.copy()
            temp_df['Spread'] = (temp_df['Offer_Price'] - temp_df['Bid_Price']) / temp_df['Mid_Price']
            self.feature_map['QuotedSpread'] = pd.Series([temp_df.iloc[start:end+1]['Spread'].mean()
                                                          for start, end in zip(self.start_idx, self.end_idx)])
            return self.feature_map['QuotedSpread']

    # EffectiveSpread
    def getEffectiveSpread(self, df: pd.DataFrame) -> pd.Series:
        if 'EffectiveSpread' in self.feature_map:
            return self.feature_map['EffectiveSpread']
        else:
            temp_df = df.copy()
            # volume-weighted log effective spread, signed by trade side
            temp_df['Spread'] = (np.log(temp_df['Trade_Price'] / temp_df['Mid_Price'])
                                 * temp_df['Trade_Side'] * temp_df['Trade_Volume'] * temp_df['Trade_Price'])
            temp_df['Vt_P'] = temp_df['Trade_Volume'] * temp_df['Trade_Price']
            self.feature_map['EffectiveSpread'] = pd.Series([
                temp_df.iloc[start:end+1]['Spread'].sum() / temp_df.iloc[start:end+1]['Vt_P'].sum()
                if temp_df.iloc[start:end+1]['Vt_P'].sum() != 0 else np.nan
                for start, end in zip(self.start_idx, self.end_idx)
            ])
            return self.feature_map['EffectiveSpread']

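The two np.searchsorted calls in calc_time_deltas bound the half-open lookback window (t - delta2, t - delta1]: start_idx is the first event strictly after t - delta2, and end_idx the last event at or before t - delta1. A toy check with one event per second (timestamps made up):

import numpy as np
import pandas as pd

pts = pd.to_datetime(["2020-01-06 09:30:00", "2020-01-06 09:30:01",
                      "2020-01-06 09:30:02", "2020-01-06 09:30:03"])
delta1, delta2 = 0.0, 2.0

start_idx = np.searchsorted(pts, pts - pd.Timedelta(seconds=delta2), side='right')
end_idx = np.searchsorted(pts, pts - pd.Timedelta(seconds=delta1), side='right') - 1

print(start_idx)  # [0 0 1 2] -> the event exactly 2s back is excluded
print(end_idx)    # [0 1 2 3] -> each event's own timestamp is included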

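And a hypothetical end-to-end sketch of the class above. Everything here is illustrative: the toy frame, the column conventions (e.g. Trade_Side as +1/-1, Type 'T' for trades), and the window lengths are assumptions, not part of the commit; the class does assume a default RangeIndex, since getForwardReturn zips df.index against positional offsets.

import numpy as np
import pandas as pd

from torch_models.metadata.data_processors.features_taq import CalendarFeatureGenerator

# toy merged trades/quotes frame, sorted by Participant_Timestamp
df = pd.DataFrame({
    "Participant_Timestamp": pd.to_datetime(
        ["2020-01-06 09:30:00", "2020-01-06 09:30:01", "2020-01-06 09:30:02"]),
    "Type": ["T", "Q", "T"],              # assumed: 'T' trade, 'Q' quote
    "Trade_Price": [100.0, np.nan, 100.2],
    "Trade_Volume": [50, 0, 75],
    "Trade_Side": [1, 0, -1],             # assumed: +1 buy, -1 sell
    "Bid_Price": [99.9, 100.0, 100.1],
    "Offer_Price": [100.1, 100.2, 100.3],
    "Bid_Size": [10, 12, 8],
    "Offer_Size": [11, 9, 10],
})
df["Mid_Price"] = (df["Bid_Price"] + df["Offer_Price"]) / 2

gen = CalendarFeatureGenerator()
gen.calc_time_deltas(df, delta1=0.0, delta2=60.0)   # 60-second lookback
gen.getBreath(df)
gen.getVolumeAll(df)
gen.getQuotedSpread(df)
gen.getForwardReturn(df, prediction_length=10.0)    # 10-second forward return
features = gen.features_to_df(df)                   # indexed by Participant_Timestamp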