Commit: add torch_model

Zilin Chen authored and Zilin Chen committed Feb 7, 2024
1 parent f13c34b commit 3e76c6d
Showing 23 changed files with 3,324 additions and 4 deletions.
2,131 changes: 2,131 additions & 0 deletions Example_torch_model.ipynb

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions config.py
@@ -0,0 +1,14 @@
from __future__ import annotations


# directories
DATA_SAVE_DIR = "data"
TRAINED_MODEL_DIR = "trained_models"

# date format: %Y-%m-%d
START_DATE = "2020-01-06"
END_DATE = "2020-01-07"

# trades/quotes subcolumns
TRADE_COLUMNS = ['Time', 'Symbol', 'Date', 'Participant_Timestamp', 'Trade_Volume', 'Trade_Price', 'Trade_Reporting_Facility']
QUOTE_COLUMNS = ['Time', 'Symbol', 'Date', 'Participant_Timestamp', 'Bid_Price', 'Bid_Size', 'Offer_Price', 'Offer_Size']
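For context, a minimal sketch of how these constants might be consumed downstream, assuming the gzipped CSV naming scheme used in data_preprocessing/get_data.py; the exact filename here is illustrative, not part of the commit:

import pandas as pd

import config

# hypothetical raw file path, following the datapath + '/raw/trades_{symbol}_...' scheme in get_data.py
path = f"{config.DATA_SAVE_DIR}/raw/trades_AAPL_20200106-20200107.csv.gz"
trades = pd.read_csv(path, usecols=config.TRADE_COLUMNS)  # compression inferred from .gz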
107 changes: 107 additions & 0 deletions config_tickers.py
@@ -0,0 +1,107 @@
from __future__ import annotations

SINGLE_TICKER = ["AAPL"]


NAS_100_TICKER = [
"AMGN",
"AAPL",
"AMAT",
"INTC",
"PCAR",
"PAYX",
"MSFT",
"ADBE",
"CSCO",
"XLNX",
"QCOM",
"COST",
"SBUX",
"FISV",
"CTXS",
"INTU",
"AMZN",
"EBAY",
"BIIB",
"CHKP",
"GILD",
"NLOK",
"CMCSA",
"FAST",
"ADSK",
"CTSH",
"NVDA",
"GOOGL",
"ISRG",
"VRTX",
"HSIC",
"BIDU",
"ATVI",
"ADP",
"ROST",
"ORLY",
"CERN",
"BKNG",
"MYL",
"MU",
"DLTR",
"ALXN",
"SIRI",
"MNST",
"AVGO",
"TXN",
"MDLZ",
"FB",
"ADI",
"WDC",
"REGN",
"LBTYK",
"VRSK",
"NFLX",
"TSLA",
"CHTR",
"MAR",
"ILMN",
"LRCX",
"EA",
"AAL",
"WBA",
"KHC",
"BMRN",
"JD",
"SWKS",
"INCY",
"PYPL",
"CDW",
"FOXA",
"MXIM",
"TMUS",
"EXPE",
"TCOM",
"ULTA",
"CSX",
"NTES",
"MCHP",
"CTAS",
"KLAC",
"HAS",
"JBHT",
"IDXX",
"WYNN",
"MELI",
"ALGN",
"CDNS",
"WDAY",
"SNPS",
"ASML",
"TTWO",
"PEP",
"NXPI",
"XEL",
"AMD",
"NTAP",
"VRSN",
"LULU",
"WLTW",
"UAL",
]
Empty file added data_preprocessing/__init__.py
Empty file.
10 changes: 6 additions & 4 deletions data_preprocessing/get_data.py
@@ -1,11 +1,12 @@
import os
import pandas as pd

import paramiko
from dotenv import load_dotenv
from scp import SCPClient


-def get_trades(symbols, start_date, end_date, row_limit,columns):
+def get_trades(datapath, symbols, start_date, end_date, row_limit,columns):
# load the contents of the .env file into the environment
load_dotenv()

@@ -43,10 +44,11 @@ def get_trades(symbols, start_date, end_date, row_limit,columns):

# fetch the remote file 'trade_results.csv' from the directory 'TAQNYSE-Clickhouse'
# and save it to the data directory in the pipelines folder
-local_file_path = f'data/trades_{symbol}_{start_date.replace("-", "")}-{end_date.replace("-", "")}.csv.gz'
+local_file_path = datapath + f'/raw/trades_{symbol}_{start_date.replace("-", "")}-{end_date.replace("-", "")}.csv.gz'
os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

scp.get(f"trade_results.csv.gz", local_file_path)


except Exception as e:
print(f"An error occurred: {e}")
@@ -58,7 +60,7 @@ def get_trades(symbols, start_date, end_date, row_limit,columns):
ssh.close()


-def get_quotes(symbols, start_date, end_date, row_limit,columns):
+def get_quotes(datapath, symbols, start_date, end_date, row_limit,columns):
# load the contents of the .env file into the environment
load_dotenv()

@@ -96,7 +98,7 @@ def get_quotes(symbols, start_date, end_date, row_limit,columns):

# fetch the remote file 'trade_results.csv' from the directory 'TAQNYSE-Clickhouse'
# and save it to the data directory in the pipelines folder
-local_file_path = f'data/quotes_{symbol}_{start_date.replace("-", "")}-{end_date.replace("-", "")}.csv.gz'
+local_file_path = datapath + f'/raw/quotes_{symbol}_{start_date.replace("-", "")}-{end_date.replace("-", "")}.csv.gz'
os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
scp.get(f"quote_results.csv.gz", local_file_path)

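An illustrative call sketch for the updated signatures, assuming a .env file with the SSH credentials that load_dotenv() picks up; the row limit is an arbitrary example value:

from config import DATA_SAVE_DIR, START_DATE, END_DATE, TRADE_COLUMNS, QUOTE_COLUMNS
from config_tickers import SINGLE_TICKER
from data_preprocessing.get_data import get_trades, get_quotes

# files now land under DATA_SAVE_DIR/raw/ via the new datapath argument
get_trades(DATA_SAVE_DIR, SINGLE_TICKER, START_DATE, END_DATE, 1_000_000, TRADE_COLUMNS)
get_quotes(DATA_SAVE_DIR, SINGLE_TICKER, START_DATE, END_DATE, 1_000_000, QUOTE_COLUMNS)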
Empty file added torch_models/__init__.py
Empty file.
Empty file.
11 changes: 11 additions & 0 deletions torch_models/evaluations/metrics.py
@@ -0,0 +1,11 @@
import numpy as np


def out_of_sample_R2(y_true, y_predict):
    """Out-of-sample R^2: 1 - SS_res / SS_tot over the evaluation window."""
    # residual sum of squares
    SS_res = np.sum((y_true - y_predict) ** 2)

    # total sum of squares around the mean of the true values
    mean_y_true = np.mean(y_true)
    SS_tot = np.sum((y_true - mean_y_true) ** 2)

    return 1 - (SS_res / SS_tot)
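A quick sanity check with made-up numbers; note the metric goes negative whenever the predictions do worse than the in-window mean of y_true:

import numpy as np

from torch_models.evaluations.metrics import out_of_sample_R2

y_true = np.array([0.010, -0.020, 0.005, 0.015])
y_pred = np.array([0.008, -0.015, 0.000, 0.020])
print(out_of_sample_R2(y_true, y_pred))  # ~0.89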
11 changes: 11 additions & 0 deletions torch_models/metadata/__init__.py
@@ -0,0 +1,11 @@
from typing import runtime_checkable, Protocol


@runtime_checkable
class Dataset(Protocol):
    """
    Abstract class for a dataset used for model training
    """

    def __len__(self) -> int:
        raise NotImplementedError
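Because the protocol is @runtime_checkable, any class exposing __len__ passes an isinstance check without inheriting from Dataset. A hypothetical example (TaqDataset is not part of this commit):

from torch_models.metadata import Dataset

class TaqDataset:
    def __init__(self, samples):
        self.samples = samples

    def __len__(self) -> int:
        return len(self.samples)

assert isinstance(TaqDataset([1, 2, 3]), Dataset)  # structural check, no inheritance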
Empty file.
175 changes: 175 additions & 0 deletions torch_models/metadata/data_processors/features_taq.py
@@ -0,0 +1,175 @@
from __future__ import annotations

import pandas as pd
import numpy as np
from typing import Optional


class CalendarFeatureGenerator:
    """
    Generate calendar-mode features for TAQ data
    """

    def __init__(self):
        self.feature_map = {}

    def reset(self):
        self.feature_map = {}
        return

    def features_to_df(self, df: pd.DataFrame) -> pd.DataFrame:
        feature_df = pd.DataFrame(self.feature_map)
        feature_df.index = pd.DatetimeIndex(df['Participant_Timestamp'])
        return feature_df

    def getForwardReturn(self, df: pd.DataFrame, prediction_length: float):
        if 'Return' in self.feature_map:
            return self.feature_map['Return']
        else:
            # mean trade price over the next `prediction_length` seconds,
            # relative to the current mid price (assumes a default RangeIndex)
            pts = pd.to_datetime(df['Participant_Timestamp'].values)
            forward_idx = np.searchsorted(pts, pts + pd.Timedelta(seconds=prediction_length), side='right')
            self.feature_map['Return'] = pd.Series([df.iloc[start:end]['Trade_Price'].mean() / df.iloc[start]['Mid_Price'] - 1
                                                    for start, end in zip(df.index, forward_idx)])
            return self.feature_map['Return']

    def calc_time_deltas(self, df: pd.DataFrame, delta1: float, delta2: float):
        """
        Calculate start/end indices for the lookback interval (t - delta2, t - delta1]
        """
        self.delta1, self.delta2 = delta1, delta2
        pts = pd.to_datetime(df['Participant_Timestamp'].values)
        self.start_idx = np.searchsorted(pts, pts - pd.Timedelta(seconds=delta2), side='right')
        self.end_idx = np.searchsorted(pts, pts - pd.Timedelta(seconds=delta1), side='right') - 1

        return


    # Breath (number of trades in the lookback window)
    def getBreath(self, df: pd.DataFrame) -> pd.Series:
        if 'Breath' in self.feature_map:
            return self.feature_map['Breath']
        else:
            self.feature_map['Breath'] = pd.Series([df.iloc[start:end+1]['Type'].eq('T').sum()
                                                    for start, end in zip(self.start_idx, self.end_idx)])
            return self.feature_map['Breath']

    # Immediacy
    def getImmediacy(self, df: pd.DataFrame) -> pd.Series:
        if 'Immediacy' in self.feature_map:
            return self.feature_map['Immediacy']
        else:
            breath = self.getBreath(df)
            delta_diff = self.delta2 - self.delta1
            self.feature_map['Immediacy'] = np.where(breath == 0, np.nan, delta_diff / breath)
            return self.feature_map['Immediacy']

    # VolumeAll
    def getVolumeAll(self, df: pd.DataFrame) -> pd.Series:
        if 'VolumeAll' in self.feature_map:
            return self.feature_map['VolumeAll']
        else:
            self.feature_map['VolumeAll'] = pd.Series([df.iloc[start:end+1]['Trade_Volume'].sum()
                                                       for start, end in zip(self.start_idx, self.end_idx)])
            return self.feature_map['VolumeAll']

    # VolumeAvg
    def getVolumeAvg(self, df: pd.DataFrame) -> pd.Series:
        if 'VolumeAvg' in self.feature_map:
            return self.feature_map['VolumeAvg']
        else:
            volumeAll = self.getVolumeAll(df)
            breath = self.getBreath(df)
            # guard against empty windows (breath == 0), as in getImmediacy
            self.feature_map['VolumeAvg'] = np.where(breath != 0, volumeAll / breath, np.nan)
            return self.feature_map['VolumeAvg']

    # VolumeMax
    def getVolumeMax(self, df: pd.DataFrame) -> pd.Series:
        if 'VolumeMax' in self.feature_map:
            return self.feature_map['VolumeMax']
        else:
            self.feature_map['VolumeMax'] = pd.Series([max(df.iloc[start:end+1]['Trade_Volume'], default=0)
                                                       for start, end in zip(self.start_idx, self.end_idx)])
            return self.feature_map['VolumeMax']

    # Lambda (mid-price change per unit volume)
    def getLambda(self, df: pd.DataFrame) -> pd.Series:
        if 'Lambda' in self.feature_map:
            return self.feature_map['Lambda']
        else:
            volumeAll = self.getVolumeAll(df)
            p_change = pd.Series([df.iloc[end]['Mid_Price'] - df.iloc[start]['Mid_Price']
                                  for start, end in zip(self.start_idx, self.end_idx)])

            self.feature_map['Lambda'] = np.where(volumeAll != 0, p_change / volumeAll, np.nan)
            return self.feature_map['Lambda']

    # LobImbalance
    def getLobImbalance(self, df: pd.DataFrame) -> pd.Series:
        if 'LobImbalance' in self.feature_map:
            return self.feature_map['LobImbalance']
        else:
            temp_df = df.copy()
            temp_df['Imbalance'] = (temp_df['Offer_Size'] - temp_df['Bid_Size']) / (temp_df['Offer_Size'] + temp_df['Bid_Size'])
            self.feature_map['LobImbalance'] = pd.Series([temp_df.iloc[start:end+1]['Imbalance'].mean()
                                                          for start, end in zip(self.start_idx, self.end_idx)])
            return self.feature_map['LobImbalance']

    # TxnImbalance
    def getTxnImbalance(self, df: pd.DataFrame) -> pd.Series:
        if 'TxnImbalance' in self.feature_map:
            return self.feature_map['TxnImbalance']
        else:
            volumeAll = self.getVolumeAll(df)
            temp_df = df.copy()
            temp_df['Vt_Dir'] = temp_df['Trade_Volume'] * temp_df['Trade_Side']
            sum_Vt_Dir = pd.Series([temp_df.iloc[start:end+1]['Vt_Dir'].sum() for start, end in zip(self.start_idx, self.end_idx)])
            self.feature_map['TxnImbalance'] = np.where(volumeAll != 0, sum_Vt_Dir / volumeAll, np.nan)
            return self.feature_map['TxnImbalance']

    # PastReturn
    def getPastReturn(self, df: pd.DataFrame) -> pd.Series:
        if 'PastReturn' in self.feature_map:
            return self.feature_map['PastReturn']
        else:
            p_return = pd.Series([df.iloc[start:end+1]['Trade_Price'].mean() / df.iloc[end]['Mid_Price']
                                  for start, end in zip(self.start_idx, self.end_idx)])

            self.feature_map['PastReturn'] = 1 - p_return
            return self.feature_map['PastReturn']

    # QuotedSpread
    def getQuotedSpread(self, df: pd.DataFrame) -> pd.Series:
        if 'QuotedSpread' in self.feature_map:
            return self.feature_map['QuotedSpread']
        else:
            temp_df = df.copy()
            temp_df['Spread'] = (temp_df['Offer_Price'] - temp_df['Bid_Price']) / temp_df['Mid_Price']
            self.feature_map['QuotedSpread'] = pd.Series([temp_df.iloc[start:end+1]['Spread'].mean()
                                                          for start, end in zip(self.start_idx, self.end_idx)])
            return self.feature_map['QuotedSpread']

    # EffectiveSpread
    def getEffectiveSpread(self, df: pd.DataFrame) -> pd.Series:
        if 'EffectiveSpread' in self.feature_map:
            return self.feature_map['EffectiveSpread']
        else:
            temp_df = df.copy()
            # volume-weighted log effective spread, signed by trade side
            temp_df['Spread'] = (np.log(temp_df['Trade_Price'] / temp_df['Mid_Price'])
                                 * temp_df['Trade_Side'] * temp_df['Trade_Volume'] * temp_df['Trade_Price'])
            temp_df['Vt_P'] = temp_df['Trade_Volume'] * temp_df['Trade_Price']
            self.feature_map['EffectiveSpread'] = pd.Series([
                temp_df.iloc[start:end+1]['Spread'].sum() / temp_df.iloc[start:end+1]['Vt_P'].sum()
                if temp_df.iloc[start:end+1]['Vt_P'].sum() != 0 else np.nan
                for start, end in zip(self.start_idx, self.end_idx)
            ])
            return self.feature_map['EffectiveSpread']

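The two np.searchsorted calls in calc_time_deltas bound the half-open lookback window (t - delta2, t - delta1]: start_idx is the first event strictly after t - delta2, and end_idx the last event at or before t - delta1. A toy check with one event per second (timestamps made up):

import numpy as np
import pandas as pd

pts = pd.to_datetime(["2020-01-06 09:30:00", "2020-01-06 09:30:01",
                      "2020-01-06 09:30:02", "2020-01-06 09:30:03"])
delta1, delta2 = 0.0, 2.0

start_idx = np.searchsorted(pts, pts - pd.Timedelta(seconds=delta2), side='right')
end_idx = np.searchsorted(pts, pts - pd.Timedelta(seconds=delta1), side='right') - 1

print(start_idx)  # [0 0 1 2] -> the event exactly 2s back is excluded
print(end_idx)    # [0 1 2 3] -> each event's own timestamp is included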

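And a hypothetical end-to-end sketch of the class above. Everything here is illustrative: the toy frame, the column conventions (e.g. Trade_Side as +1/-1, Type 'T' for trades), and the window lengths are assumptions, not part of the commit; the class does assume a default RangeIndex, since getForwardReturn zips df.index against positional offsets.

import numpy as np
import pandas as pd

from torch_models.metadata.data_processors.features_taq import CalendarFeatureGenerator

# toy merged trades/quotes frame, sorted by Participant_Timestamp
df = pd.DataFrame({
    "Participant_Timestamp": pd.to_datetime(
        ["2020-01-06 09:30:00", "2020-01-06 09:30:01", "2020-01-06 09:30:02"]),
    "Type": ["T", "Q", "T"],              # assumed: 'T' trade, 'Q' quote
    "Trade_Price": [100.0, np.nan, 100.2],
    "Trade_Volume": [50, 0, 75],
    "Trade_Side": [1, 0, -1],             # assumed: +1 buy, -1 sell
    "Bid_Price": [99.9, 100.0, 100.1],
    "Offer_Price": [100.1, 100.2, 100.3],
    "Bid_Size": [10, 12, 8],
    "Offer_Size": [11, 9, 10],
})
df["Mid_Price"] = (df["Bid_Price"] + df["Offer_Price"]) / 2

gen = CalendarFeatureGenerator()
gen.calc_time_deltas(df, delta1=0.0, delta2=60.0)   # 60-second lookback
gen.getBreath(df)
gen.getVolumeAll(df)
gen.getQuotedSpread(df)
gen.getForwardReturn(df, prediction_length=10.0)    # 10-second forward return
features = gen.features_to_df(df)                   # indexed by Participant_Timestamp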