diff --git a/reports/Content/Appendix.tex b/reports/Content/Appendix.tex index 5a629ab9..de66d80e 100644 --- a/reports/Content/Appendix.tex +++ b/reports/Content/Appendix.tex @@ -15,7 +15,7 @@ \section{Appendix} Research & Data & Sample Period & Method & Baseline & Improvement \\ \midrule \autocite[][15]{rosenthalModelingTradeDirection2012} & \gls{NASDAQ} & & Logistic regression & \gls{EMO} rule, \gls{LR} rule,\newline and tick rule & max. \SI{2.2}{\percent} \\ \cmidrule{2-6} & \gls{NYSE} & 03/12/2004 -- 31/12/2004 & Logistic regression & \gls{EMO} rule, \gls{LR} rule,\newline and tick rule & max. \SI{1.1}{\percent} \\\cmidrule{1-6} - \autocite[][489--494]{blazejewskiLocalNonParametricModel2005} & Australian Stock\newline Exchange & 11/11/2002 -- 27/08/2003 & $k$ nearest neighbour, \newline logistic regression,\newline trade continuation,\newline majority vote & - & - \\ \cmidrule{1-6} + \autocite[][489--494]{blazejewskiLocalNonParametricModel2005} & Australian Stock\newline Exchange & 11/11/2002 -- 27/08/2003 & $k$ nearest neighbor, \newline logistic regression,\newline trade continuation,\newline majority vote & - & - \\ \cmidrule{1-6} \autocite[][49--57]{ronenMachineLearningTrade2022} & \gls{TRACE} & 01/07/2002 -- 31/12/2019 & Logistic regression, decision tree,\newline neural network, and random forests & \gls{LR} rule and tick rule,\newline and \gls{BVC} algorithm & max. \SI{13.3}{\percent} \\ \cmidrule{2-6} & \gls{NASDAQ} & 09/12/2013 -- 13/12/2013 & Logistic regression, decision tree,\newline neural network, and random forests & \gls{LR} rule, tick rule,\newline and \gls{BVC} algorithm & max. \SI{3.3}{\percent} \\ \bottomrule \end{tabular} @@ -65,32 +65,32 @@ \subsection{Features and Transformations} \begin{tabular}{@{}ll@{}} \toprule Feature Name & Transform \\ \midrule - trade price & $\log$ + standardised \\ - price lag (ex) & $\log$ + standardised \\ - price lag (all) & $\log$ + standardised \\ - price change lag (ex) & clipped + standardised \\ - price change lag (all) & clipped + standardised \\ - price lead (ex) & $\log$ + standardised \\ - price lead (all) & $\log$ + standardised \\ - price change lead (ex) & clipped + standardised \\ - price change lead (all) & clipped + standardised \\ - bid (all) & $\log$ + standardised \\ - bid (ex) & $\log$ + standardised \\ - ask (all) & $\log$ + standardised \\ - ask (ex) & $\log$ + standardised \\ - prox. to quotes (ex) & clipped + standardised \\ - prox. to quotes (all) & clipped + standardised \\ - bid ask size ratio (ex) & clipped + standardised \\ - bid size (ex) & $\log$ + standardised \\ - ask size (ex) & $\log$ + standardised \\ - rel. bid size (ex) & clipped + standardised \\ - rel. ask size (ex) & clipped + standardised \\ - trade size & $\log$ + standardised \\ - strike price & $\log$ + standardised \\ - volume option series & $\log$ + standardised \\ + trade price & $\log$ + standardized \\ + price lag (ex) & $\log$ + standardized \\ + price lag (all) & $\log$ + standardized \\ + price change lag (ex) & clipped + standardized \\ + price change lag (all) & clipped + standardized \\ + price lead (ex) & $\log$ + standardized \\ + price lead (all) & $\log$ + standardized \\ + price change lead (ex) & clipped + standardized \\ + price change lead (all) & clipped + standardized \\ + bid (all) & $\log$ + standardized \\ + bid (ex) & $\log$ + standardized \\ + ask (all) & $\log$ + standardized \\ + ask (ex) & $\log$ + standardized \\ + prox. to quotes (ex) & clipped + standardized \\ + prox. 
to quotes (all) & clipped + standardized \\ + bid ask size ratio (ex) & clipped + standardized \\ + bid size (ex) & $\log$ + standardized \\ + ask size (ex) & $\log$ + standardized \\ + rel. bid size (ex) & clipped + standardized \\ + rel. ask size (ex) & clipped + standardized \\ + trade size & $\log$ + standardized \\ + strike price & $\log$ + standardized \\ + volume option series & $\log$ + standardized \\ root & binarised \\ - time to maturity & standardised \\ - moneyness & standardised \\ + time to maturity & standardized \\ + moneyness & standardized \\ option type & binarised \\ issue type & binarised \\ \bottomrule \end{tabular} @@ -168,6 +168,6 @@ \subsection{Attention Heads of Transformer} % \subfloat[Head (2,8)]{\label{sfig:hb}\includegraphics[width=.23\textwidth]{attention_head_8_layer_2_color_green_ise_quotes_mid.pdf}}\hfill % \subfloat[Head (3,8)]{\label{sfig:hc}\includegraphics[width=.23\textwidth]{attention_head_8_layer_3_color_green_ise_quotes_mid.pdf}}\hfill % \subfloat[Head (4,8)]{\label{sfig:hd}\includegraphics[width=.23\textwidth]{attention_head_8_layer_4_color_green_ise_quotes_mid.pdf}}\\ - \caption[Rule-Like Roles of All Attention Heads]{Attention heads that correspond to trade classification rules. Tuple denotes the location of the attention head in the model in the form of (layer, head). Plot visualises the attention weights for a trade executed at the quote, correctly classified by the model.} + \caption[Rule-Like Roles of All Attention Heads]{Attention heads that correspond to trade classification rules. Tuple denotes the location of the attention head in the model in the form of (layer, head). Plot visualizes the attention weights for a trade executed at the quote, correctly classified by the model.} \label{fig:attention-heads-ise-all-transformer} \end{figure} \ No newline at end of file diff --git a/reports/Content/data-preprocessing.tex b/reports/Content/data-preprocessing.tex index 144ed165..36b2e307 100644 --- a/reports/Content/data-preprocessing.tex +++ b/reports/Content/data-preprocessing.tex @@ -11,7 +11,7 @@ \subsection{Data and Data Preparation}\label{sec:data-and-data-preparation} \subsubsection{Data Collection}\label{sec:data-collection} -Testing the empirical accuracy of our approaches requires option trades where the true initiator is known. To arrive at a labelled sample, we combine data from four individual data sources. Our primary source is LiveVol, which records option trades executed at US option exchanges at a transaction level. We limit our focus to option trades executed at the \gls{CBOE} and \gls{ISE}. LiveVol contains both trade and matching quote data. Like most proprietary data sources, it does not distinguish the initiator nor does it include the involved trader types. For the \gls{CBOE} and \gls{ISE} exchange, the \gls{ISE} Open/Close Trade Profile and \gls{CBOE} Open-Close Volume Summary contain the buy and sell volumes for the option series by trader type aggregated on a daily level. A combination of the LiveVol dataset with the open/close data, allows us to infer the trade initiator for a subset of trades. For evaluation and use in some of our machine learning models, we acquire additional underlying and option characteristics from IvyDB's OptionMetrics. +Testing the empirical accuracy of our approaches requires option trades where the true initiator is known. To arrive at a labeled sample, we combine data from four individual data sources. 
Our primary source is LiveVol, which records option trades executed at US option exchanges at a transaction level. We limit our focus to option trades executed at the \gls{CBOE} and \gls{ISE}. LiveVol contains both trade and matching quote data. Like most proprietary data sources, it does not distinguish the initiator nor does it include the involved trader types. For the \gls{CBOE} and \gls{ISE} exchanges, the \gls{ISE} Open/Close Trade Profile and \gls{CBOE} Open-Close Volume Summary contain the buy and sell volumes for the option series by trader type aggregated on a daily level. Combining the LiveVol dataset with the open/close data allows us to infer the trade initiator for a subset of trades. For evaluation and use in some of our machine learning models, we acquire additional underlying and option characteristics from IvyDB's OptionMetrics. In \cref{sec:trade-initiator} we discussed three views on the trade initiator. Due to the absence of order entry times or order types in our data sources, we define the trade initiator based on the position relative to the market maker, who caters to the liquidity demand. Specifically, we classify customer trades as buyer-initiated if the trade is due to a customer buy order and as seller-initiated for customer sales. As previous literature, e.g., \textcite[][4276]{garleanuDemandBasedOptionPricing2009}, suggests that trader types, for example, proprietary traders, have a similar role to market makers by supplying liquidity, we limit our analysis to trades between customers and market makers for which the picture is unambiguous. Our definition is consistent with that of \textcite[][8]{grauerOptionTradeClassification2022}. @@ -20,24 +20,24 @@ \subsubsection{Data Collection}\label{sec:data-collection} \begin{enumerate}[label=(\roman*),noitemsep] \item trades with a trade price $\leq \SI{0}[\$]{}$, \item trades with a trade volume $\leq 0$ or $\ge \num{10000000}$ contracts, - \item cancelled or duplicated trades, + \item canceled or duplicated trades, \item entries with multiple underlying symbols for the same root. \end{enumerate} The open/close datasets for the \gls{ISE} and \gls{CBOE} contain the daily buy and sell volumes for the option series by trader type, the trade volume and whether a position was closed or opened. Four trader types are available: customer, professional customer, broker/dealer, and firm proprietary. Customer orders are placed by a retail trader or a member of the exchange on behalf of the customer. Professional customers are distinguished from the former by a high trading activity ($\geq390$ orders per day over a one-month period). Likewise, trades by a member are classified as proprietary if executed for their account, or as broker/dealer if placed for non-members of the exchange \autocite[][2]{nasdaqincFrequentlyAskedQuestions2017}. Trades of customers and professional customers are detailed by trade volume ($\leq 100$; 101--199; $> 199$ contracts), as well as by whether a position is newly opened or closed. We first sum buy and sell orders of all trader types and volumes to obtain the daily trading volumes at the \gls{ISE} or \gls{CBOE} per option series and day. Separately for the customer buy and sell volumes, we calculate the daily aggregates identified by the account type customer.
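For illustration, the daily aggregation and the labeling rule described in this and the following paragraph can be sketched in Python with pandas as follows. All column names (series_key, date, account_type, buy_volume, sell_volume) are hypothetical placeholders, and the snippet is a simplified sketch of the procedure as described in the text, not the code used in this study.
\begin{verbatim}
import pandas as pd

def label_from_open_close(oc: pd.DataFrame) -> pd.DataFrame:
    """Infer the trade initiator per option series and day (sketch)."""
    keys = ["series_key", "date"]

    # Daily trading volume: buy and sell orders summed over all trader types.
    total = (oc.groupby(keys)[["buy_volume", "sell_volume"]]
               .sum().sum(axis=1).rename("total_volume"))

    # Daily buy and sell volumes of the account type customer only.
    cust = (oc[oc["account_type"] == "customer"]
            .groupby(keys)[["buy_volume", "sell_volume"]].sum()
            .rename(columns={"buy_volume": "cust_buy",
                             "sell_volume": "cust_sell"}))

    daily = pd.concat([total, cust], axis=1).fillna(0.0)

    # Sign all trades of a series/day as buyer- (seller-) initiated if the
    # customer buy (sell) volume equals the daily trading volume; otherwise
    # the true label remains unknown.
    daily["label"] = pd.NA
    daily.loc[daily["cust_buy"] == daily["total_volume"], "label"] = 1
    daily.loc[daily["cust_sell"] == daily["total_volume"], "label"] = -1
    return daily.reset_index()
\end{verbatim}
The labeled series-day combinations would then be merged onto the LiveVol transactions via the unique key of the option series, as described next.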
-To infer the true label, we exploit that, if there were only customer buy or sell orders, hence the customer buy or sell volume equals the daily trading volume, we can confidently sign all transactions for the option series at the specific date and exchange as either buyer- or seller-initiated. Our labelling approach fails in the presence of non-customer or simultaneous customer buy or sell trades. The so-obtained trade initiator is merged with the LiveVol trades of the exchange based on the unique key for the option series. +To infer the true label, we exploit the fact that, if there were only customer buy or sell orders, so that the customer buy or sell volume equals the daily trading volume, we can confidently sign all transactions for the option series at the specific date and exchange as either buyer- or seller-initiated. Our labeling approach fails in the presence of non-customer or simultaneous customer buy and sell trades. The so-obtained trade initiator is merged with the LiveVol trades of the exchange based on the unique key for the option series. For the \gls{ISE} trades, our matched sample spans from 2 May 2005 to 31 May 2017 and includes \num{49203747} trades. The period covers the full history of \gls{ISE} open/close data up to the last date the dataset was available to us. Our matched \gls{CBOE} sample consists of \num{37155412} trades between 1 January 2011 and 31 October 2017. The sample period is governed by a paradigm shift in the construction of the \gls{CBOE} open/close dataset and the most recent trade in our LiveVol subscription. -Following our initial rationale to explore semi-supervised methods, we reserve unlabelled trades between 24 October 2012 and 24 October 2013 at the \gls{ISE} for pre- and self-training. We provide further details in \cref{sec:train-test-split}. Since LiveVol doesn't distinguish by trader types, this dataset includes both customer and non-customer trades, as well as simultaneous buy and sell trades on the same day. Within this period, we filter out trades for which the true label can be inferred to avoid overlap with the supervised dataset. This is crucial for self-training, where labelled and unlabelled data are presented to the model simultaneously. +Following our initial rationale to explore semi-supervised methods, we reserve unlabeled trades between 24 October 2012 and 24 October 2013 at the \gls{ISE} for pre- and self-training. We provide further details in \cref{sec:train-test-split}. Since LiveVol does not distinguish between trader types, this dataset includes both customer and non-customer trades, as well as simultaneous buy and sell trades on the same day. Within this period, we filter out trades for which the true label can be inferred to avoid overlap with the supervised dataset. This is crucial for self-training, where labeled and unlabeled data are presented to the model simultaneously. While our procedure makes the inference of the true trade initiator partly feasible, concerns regarding a selection bias due to the excessive filtering have to be raised. We address these concerns and report summary statistics for unmerged and merged sub-samples in \cref{app:summary-statistics}. In the following chapter, we motivate feature engineering, present our feature sets and discuss strategies for transforming features into a form that accelerates the training of our models. \subsubsection{Data Preprocessing}\label{sec:data-preprocessing} -Classical algorithms infer the initiator of the trade from the \emph{raw} price and quote data.
We employ feature engineering to pre-process input data and enhance the convergence and performance of our machine learning models. Gradient-boosted trees and neural networks, though, flexible estimators have limitations in synthesising new features from existing ones, as demonstrated in empirical work on synthetic data by \textcite[][5--6]{heatonEmpiricalAnalysisFeature2016}. Specifically, ratios, standard deviations, and differences can be difficult for these models to learn and must therefore be engineered beforehand. +Classical algorithms infer the initiator of the trade from the \emph{raw} price and quote data. We employ feature engineering to pre-process input data and enhance the convergence and performance of our machine-learning models. Gradient-boosted trees and neural networks, though flexible estimators, have limitations in synthesizing new features from existing ones, as demonstrated in empirical work on synthetic data by \textcite[][5--6]{heatonEmpiricalAnalysisFeature2016}. Specifically, ratios, standard deviations, and differences can be difficult for these models to learn and must therefore be engineered beforehand. \textbf{Features and Feature Sets} @@ -56,7 +56,7 @@ \subsubsection{Data Preprocessing}\label{sec:data-preprocessing} Feature Name & Definition & Source & \gls{FS} Classic & \gls{FS} Size & \gls{FS} Option \\ \midrule \endfirsthead - \multicolumn{6}{l}{\textit{Continued \tablename~\thetable}} \\ + \multicolumn{6}{l}{\emph{Continued \tablename~\thetable}} \\ \toprule Feature Name & Definition & Source & \gls{FS} Classic & \gls{FS} Size & \gls{FS} Option \\ \midrule \endhead @@ -114,11 +114,11 @@ \subsubsection{Data Preprocessing}\label{sec:data-preprocessing} Our largest feature set, abbreviated with \gls{FS} option, also incorporates option characteristics, including the option type among others. By providing unique identifiers for the option series, we can potentially establish connections between transactions when trade initiators divide a single order into sub-orders or rely on complex trades. Similar reasoning applies to the daily volume of the option series. Option features are also informative individually. Time to maturity $\tau_{i,t}$, estimated in months, indirectly affects classification performance. On \gls{CBOE} data in \textcite[][889]{savickasInferringDirectionOption2003}, trades with longer maturities are smaller, hence more likely to be classified correctly. Moreover, time to maturity can be used as a dummy to identify rollovers \autocite[][700]{muravyevOrderFlowExpected2016}. When investors are short in call or put options, they replace expiring for non-expiring options, which creates selling pressure in the non-expiring option. The feature could make the procedure learnable. Related to the time-to-maturity is moneyness, estimated as the ratio between the price of the underlying $S_{i,t}$ and the strike price $K_{i}$ for calls and the reciprocal for puts. As moneyness is linked to leverage in the investment, we reason that incentives to initiate a trade might vary between buyers and sellers. Since the classification of index options poses challenges for traditional approaches relative to other security types (cp. \textcites[][22]{grauerOptionTradeClassification2022}[][898-899]{savickasInferringDirectionOption2003}), we equip the models with the security type, as well as the option type and root, to extend the learnable context.
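As a concrete illustration of the option characteristics defined above, time to maturity in months and moneyness ($S_{i,t}/K_{i}$ for calls, the reciprocal for puts) could be derived as in the following sketch. Column names are hypothetical placeholders, and the snippet is illustrative only rather than the exact implementation.
\begin{verbatim}
import numpy as np
import pandas as pd

def add_option_features(trades: pd.DataFrame) -> pd.DataFrame:
    """Add time to maturity and moneyness as defined in the text (sketch)."""
    out = trades.copy()

    # Time to maturity, estimated in months.
    days_to_expiry = (out["expiration"] - out["trade_date"]).dt.days
    out["time_to_maturity"] = days_to_expiry / 30.44  # avg. days per month

    # Moneyness: S/K for calls and K/S (the reciprocal) for puts.
    ratio = out["underlying_price"] / out["strike"]
    out["moneyness"] = np.where(out["option_type"].eq("C"), ratio, 1.0 / ratio)
    return out
\end{verbatim}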
-Arguably, our models have simultaneous access to the previous and successive trade prices and quotes for both the exchange and the \gls{NBBO}, which is an advantage over base rules. As we benchmark against various, stacked hybrid rules, the data requirements are still comparable. We emphasise this aspect, as it is neglected in previous works \autocites[][485]{blazejewskiLocalNonParametricModel2005}[][48]{ronenMachineLearningTrade2022}[][9]{rosenthalModelingTradeDirection2012}. +Arguably, our models have simultaneous access to the previous and successive trade prices and quotes for both the exchange and the \gls{NBBO}, which is an advantage over base rules. As we benchmark against various, stacked hybrid rules, the data requirements are still comparable. We emphasize this aspect, as it is neglected in previous works \autocites[][485]{blazejewskiLocalNonParametricModel2005}[][48]{ronenMachineLearningTrade2022}[][9]{rosenthalModelingTradeDirection2012}. \textbf{Numerical Features} -Pricing or quote data can often not be fully reconstructed, resulting in missing values across all features. Decision trees and ensembles thereof can inherently handle $\mathtt{[NaN]}$ values by discarding missing values in the splitting procedure \autocite[][150--152]{breimanClassificationRegressionTrees2017} or by incorporating missing values into the splitting criterion \autocite[][951]{twalaGoodMethodsCoping2008}. Transformers require missing values to be imputed beforehand, as a $\mathtt{[NaN]}$ value cannot be propagated through the network. We choose zero imputation for being a single-pass strategy that minimises data leakage and allows \glspl{GBRT} and neural networks to separate imputed values from observed ones. With a low degree of missing values, the impact on the final result is minuscule. +Pricing or quote data can often not be fully reconstructed, resulting in missing values across all features. Decision trees and ensembles thereof can inherently handle $\mathtt{[NaN]}$ values by discarding missing values in the splitting procedure \autocite[][150--152]{breimanClassificationRegressionTrees2017} or by incorporating missing values into the splitting criterion \autocite[][951]{twalaGoodMethodsCoping2008}. Transformers require missing values to be imputed beforehand, as a $\mathtt{[NaN]}$ value cannot be propagated through the network. We choose zero imputation for being a single-pass strategy that minimizes data leakage and allows \glspl{GBRT} and neural networks to separate imputed values from observed ones. With a low degree of missing values, the impact on the final result is minuscule. Price and size-related features exhibit positive skewness. Tree-based learners are unaffected by the feature scale, as the splitting process is based on the quality of the split but not on the scale of splitting value (cp. \cref{sec:decision-tree}). To avoid the tails of the distribution dominating the weight updates of neural networks, we apply power transformations, which transform the distribution of features to be Gaussian-like. Apart from quantization effects, \glspl{GBRT} are unaffected. We determine the power transformation using the Box-Cox procedure \autocite[][214]{boxAnalysisTransformations2022}, given by: @@ -127,18 +127,18 @@ \subsubsection{Data Preprocessing}\label{sec:data-preprocessing} \label{eq:box-cox-test} \end{equation} -Here, $\lambda$ is the power parameter and determines the specific power function. It is estimated by optimising the Gaussian likelihood on the training set. 
As shown in \cref{eq:box-cox-test}, a value of $\lambda=0$ corresponds to a log-transform, while $\lambda=1$ leaves the feature unaltered. As the test is only defined on positive $\mathbf{X}\left[:,j\right]$, we follow common practice by adding a constant if needed. Our estimates for $\lambda$ are documented in the \cref{app:power-transforms-of-features}. Based on the results of the Box-Cox test, we apply a common $\mathbf{X}\left[:,j\right]=\log(\mathbf{X}\left[:,j\right])$ transform on all price and size-related features with the effect of compressing large values and expanding smaller ones.\footnote{More specifically, $\mathtt{log1p}$ is used to improve numerical stability in floating point calculations. Results are not affected.} The use of the Box-Cox transform differs from its original purpose. +Here, $\lambda$ is the power parameter and determines the specific power function. It is estimated by optimizing the Gaussian likelihood on the training set. As shown in \cref{eq:box-cox-test}, a value of $\lambda=0$ corresponds to a log-transform, while $\lambda=1$ leaves the feature unaltered. As the test is only defined on positive $\mathbf{X}\left[:,j\right]$, we follow common practice by adding a constant if needed. Our estimates for $\lambda$ are documented in \cref{app:power-transforms-of-features}. Based on the results of the Box-Cox test, we apply a common $\mathbf{X}\left[:,j\right]=\log(\mathbf{X}\left[:,j\right])$ transform on all price and size-related features with the effect of compressing large values and expanding smaller ones.\footnote{More specifically, $\mathtt{log1p}$ is used to improve numerical stability in floating point calculations. Results are not affected.} The use of the Box-Cox transform differs from its original purpose. % In feature engineering, the transformation is used in an unsupervised fashion, as the transformation's outcome is not directly used in the model and the transformation is applied to the features, rather than the model's residuals \autocite[122]{kuhnFeatureEngineeringSelection2020}. In experimental tests, features derived as ratios, such as the proximity to quotes, pose a particular challenge for training the FT-Transformer. We observe that extreme outliers dominate the gradient update, leading to unstable gradients and poor convergence. We resolve the issue by clipping to a range $[-3,3]$. -To further improve the convergence of Transformers, we normalise all numerical features using $z$-score normalisation to obtain zero mean and unit variance. Intuitionally, the zero means prevents bias in the direction of the weight update and scaling to unit variance balances the rate at which parameters are updated \autocite[][8]{lecunEfficientBackProp2012}. Normalisation of raw inputs is complementary to batch normalisation, which is used in deeper layers of the Transformer stack and single batches. Following good standards, all statistics are estimated on the imputed training set only. The unlabelled \gls{ISE} training set and the \gls{CBOE} test set share the statistics of the \gls{ISE} labelled training set. +To further improve the convergence of Transformers, we normalize all numerical features using $z$-score normalization to obtain zero mean and unit variance. Intuitively, the zero mean prevents bias in the direction of the weight update, and scaling to unit variance balances the rate at which parameters are updated \autocite[][8]{lecunEfficientBackProp2012}.
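Taken together, a minimal sketch of the numerical transformations described here, assuming placeholder column indices, could look as follows: a $\mathtt{log1p}$ transform for price- and size-related features, clipping of ratio-derived features to $[-3,3]$, and $z$-score normalization with statistics estimated on the training set only. The snippet is illustrative and not the exact preprocessing code.
\begin{verbatim}
import numpy as np

def transform_numerical(X_train, X_other, log_cols, clip_cols):
    """Sketch of the numerical feature pipeline described in the text."""
    X_train, X_other = X_train.copy(), X_other.copy()

    for X in (X_train, X_other):
        # Compress large values, expand small ones; log1p for stability.
        X[:, log_cols] = np.log1p(X[:, log_cols])
        # Tame extreme outliers in ratio-derived features.
        X[:, clip_cols] = np.clip(X[:, clip_cols], -3.0, 3.0)

    # z-score normalization; mean and std are estimated on the (imputed)
    # training set only and reused for validation, test, and unlabeled data.
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0) + 1e-12  # guard against zero variance
    return (X_train - mean) / std, (X_other - mean) / std
\end{verbatim}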
Normalization of raw inputs is complementary to batch normalization, which is used in deeper layers of the Transformer stack and single batches. Following good standards, all statistics are estimated on the imputed training set only. The unlabeled \gls{ISE} training set and the \gls{CBOE} test set share the statistics of the \gls{ISE} labeled training set. -Normalisation and log transformations have the advantage of preserving the data distribution, which is a desirable property when comparing the feature importances from machine learning models against their classical counterparts in \cref{sec:feature-importance}. +Normalization and log transformations have the advantage of preserving the data distribution, which is a desirable property when comparing the feature importances from machine learning models against their classical counterparts in \cref{sec:feature-importance}. \textbf{Categorical Features} -As for the categorical variables, consisting of the option type, the underlying, and the issue type, different transformations are required. We perform a label encoding by randomly mapping every unique value onto their integer key. This basic transformation defers the handling of categorical data to the model. Also, it minimises target leakage. Missing classes or classes unseen during training are mapped to the key of an $\mathtt{[UNK]}$ \gls{token}, as motivated in \cref{sec:token-embeddings}. +As for the categorical variables, consisting of the option type, the underlying, and the issue type, different transformations are required. We perform a label encoding by randomly mapping every unique value onto their integer key. This basic transformation defers the handling of categorical data to the model. Also, it minimizes target leakage. Missing classes or classes unseen during training are mapped to the key of an $\mathtt{[UNK]}$ \gls{token}, as motivated in \cref{sec:token-embeddings}. The option type and issue type are both low-cardinal with two and five unique classes. Differently, the underlying is high-cardinal with more than \num{9107} distinct classes, as options are written on a wide range of underlyings, impacting both the model's tendency to overfit and parameter count. For simplicity in evaluation, we do not remove infrequent categories. @@ -152,7 +152,7 @@ \subsubsection{Train-Test Split}\label{sec:train-test-split} Trades in the dataset are ordered by time of execution, and nearby trades can be auto-correlated, as documented in \cref{app:autocorrelation-of-features}. Prime examples for auto-correlation between trades are market or limit orders, that are split into smaller orders for eager order execution. % Also, informed traders disguise their trading activity by slicing orders into smaller-sized orders, as reported by \textcite[][183]{anandStealthTradingOptions2007}. -The resulting, separate transactions are trivial to classify with the true label of a single transaction. This imposes constraints on the train-test split, which must ensure that minimal information leaks into the test set through serially-correlated features, leading to an otherwise overestimated model performance.\footnote{We emphasise this aspect, as previous research of \textcite[][14]{ronenMachineLearningTrade2022} is expectedly affected from this issue leading to exaggerated results.} The violation of statistical independence, out rules methods like the $k$-fold cross-validation or random train-test splits, both of which assume samples to be i.i.d. 
\autocite[][103--105]{lopezdepradoAdvancesFinancialMachine2018}. Differently, our work statically splits into subsets by time, which maintains the temporal ordering and eschews data leakage. Albeit this limits the model's ability to leverage recent information for prediction beyond the training set's cut-off point. We do not explore dynamic training schemes, as they are practically intractable considering the number of model combinations and computational requirements of Transformers and \glspl{GBRT}. In the absence of an update mechanism, our results can be interpreted as a lower bound. +The resulting separate transactions are trivial to classify with the true label of a single transaction. This imposes constraints on the train-test split, which must ensure that minimal information leaks into the test set through serially-correlated features, leading to an otherwise overestimated model performance.\footnote{We emphasize this aspect, as previous research by \textcite[][14]{ronenMachineLearningTrade2022} is presumably affected by this issue, leading to exaggerated results.} The violation of statistical independence rules out methods like $k$-fold cross-validation or random train-test splits, both of which assume samples to be i.i.d. \autocite[][103--105]{lopezdepradoAdvancesFinancialMachine2018}. Instead, our work splits the data statically into subsets by time, which maintains the temporal ordering and eschews data leakage. Admittedly, this limits the model's ability to leverage recent information for prediction beyond the training set's cut-off point. We do not explore dynamic training schemes, as they are practically intractable considering the number of model combinations and computational requirements of Transformers and \glspl{GBRT}. In the absence of an update mechanism, our results can be interpreted as a lower bound. Applying the time-based split, we allocate the first \SI{60}{\percent} of our dataset to training and the next \SI{20}{\percent} each to validation and testing. Days at the split boundary are assigned entirely to one set to avoid train-test contamination. % Data within the training and validation set may be shuffled to accelerate training. @@ -163,10 +163,10 @@ \subsubsection{Train-Test Split}\label{sec:train-test-split} \label{fig:train-test-split} \end{figure} -Overall, we use \gls{ISE} data from 2 May 2005 to 24 October 2013 to train and data between 25 October 2013 and 5 November 2015 to validate our models. The most recent trades until 31 May 2017 to assess the generalisation error. +Overall, we use \gls{ISE} data from 2 May 2005 to 24 October 2013 to train and data between 25 October 2013 and 5 November 2015 to validate our models. The most recent trades, up to 31 May 2017, are used to assess the generalization error. -Models are pre-trained on unlabelled samples from the last year of the training period. Given the significantly larger number of unlabelled customer trades, the pre-training period is reduced to one year to facilitate training on the available computing resources. +Models are pre-trained on unlabeled samples from the last year of the training period. Given the significantly larger number of unlabeled customer trades, the pre-training period is reduced to one year to facilitate training on the available computing resources. -We use the \gls{CBOE} sample past 5 November 2015 as a second test set, as visualised in \cref{fig:train-test-split}.
Our evaluation approach is the most rigorous as it disallows any form of adaptation of the models, thereby ensuring a rigorous evaluation. Unlike transfer learning techniques such as parameter or model transfer, which expectedly improve model performance, we choose to forgo these techniques and demonstrate the effectiveness of our models without any transfer of knowledge. The start date ensures that leakage from the \gls{ISE} set is minimised.\footnote{The datasets contain features, such as the \gls{NBBO}, that are identical for both sets, assuming trades were executed at both exchanges simultaneously. Also, quotes can be identical between exchanges, if market makers quote at the \gls{NBBO}, which is common practice as documented in \textcite[10]{securitiesandexchangecommissionReportConcerningExaminations2007}. Utilising the full \gls{CBOE} sample could result in exaggerated performance estimates if the corresponding \gls{ISE} trade is used in training.} +We use the \gls{CBOE} sample past 5 November 2015 as a second test set, as visualized in \cref{fig:train-test-split}. Our evaluation approach is rigorous, as it disallows any form of adaptation of the models. Unlike transfer learning techniques such as parameter or model transfer, which would be expected to improve model performance, we forgo such techniques and demonstrate the effectiveness of our models without any transfer of knowledge. The start date ensures that leakage from the \gls{ISE} set is minimized.\footnote{The datasets contain features, such as the \gls{NBBO}, that are identical for both sets, assuming trades were executed at both exchanges simultaneously. Also, quotes can be identical between exchanges, if market makers quote at the \gls{NBBO}, which is common practice as documented in \textcite[10]{securitiesandexchangecommissionReportConcerningExaminations2007}. Utilizing the full \gls{CBOE} sample could result in exaggerated performance estimates if the corresponding \gls{ISE} trade is used in training.} -Our train-test-split assumes that all subsets are drawn from the same distribution, so fitting a classifier on the training set and optimising for the validation set provides good estimates for the test set. To validate this assumption, we use adversarial validation.
Specifically, we re-label all training samples with $y=-1$ and all trades of the validation set with $y=1$, train a classifier on a random subset of the composed dataset and predict class conformance. The performance is estimated using the \gls{MCC} of \textcite[][445]{matthewsComparisonPredictedObserved1975}, which ranges between $\left[-1, 1\right]$ and is insensitive to class imbalances.\footnote{Classes are imbalanced, due to the training set being three times the size of the validation set.} Assuming train and validation samples are sampled from the same distribution, the performance estimate is near a random guess, or $\operatorname{MCC} = 0$. For the mid-sized feature set, the \gls{MCC} is \num{0.364260805498287} suggesting training and validation sets are approximately similar. The next section discusses techniques used in training the classifiers. \ No newline at end of file diff --git a/reports/Content/end.tex b/reports/Content/end.tex index 35db8a03..97f2548f 100644 --- a/reports/Content/end.tex +++ b/reports/Content/end.tex @@ -2,33 +2,33 @@ \section{Discussion}\label{sec:discussion} In this study, we applied gradient boosting and the FT-Transformer, two well-established machine learning classifiers, to the task of trade classification in the option market. While our results clearly demonstrate the superior performance of machine learning over rule-based classification, it's important to acknowledge the limitations of our approach. -Inference of our classifiers is computationally cheap, but training requires a significant amount of compute. To make training feasible at all, great effort is spent on utilising computing resources by optimising memory transfers, compute graphs, and data representation. In cases, where computing resources are scarce or classification accuracy is not the primary target, we advocate for heuristics, such as the \gls{GSU} method, which balances between computational cost and performance. +Inference of our classifiers is computationally cheap, but training requires a significant amount of compute. To make training feasible at all, great effort is spent on utilizing computing resources by optimizing memory transfers, compute graphs, and data representation. In cases, where computing resources are scarce or classification accuracy is not the primary target, we advocate for heuristics, such as the \gls{GSU} method, which balances between computational cost and performance. -All of our classifiers require some labelled instances for training. If the true label cannot be inferred from trades or generating labelled data is wasteful, our approaches are not applicable. For cases, where trades are partially labelled, our pre-trained FT-Transformer offers a viable alternative to rule-based classification. +All of our classifiers require some labeled instances for training. If the true label cannot be inferred from trades or generating labeled data is wasteful, our approaches are not applicable. For cases, where trades are partially labeled, our pre-trained FT-Transformer offers a viable alternative to rule-based classification. \newpage \section{Conclusion}\label{sec:conclusion} \todo{The predictability results survive an extensive list of robustness checks. Make clear we compare deep learning vs tree-based methods. None is superior.} -The goal of this study is to examine the performance of machine learning-based trade classification in the option market. In particular, we propose to model trade classification with Transformers and gradient boosting. 
Both approaches are supervised and leverage labelled trades. For settings, where labelled trades are scarce, we extend Transformers with a pre-training objective to train on unlabelled trades as well as generate pseudo-labels for gradient boosting through a self-training procedure. +The goal of this study is to examine the performance of machine learning-based trade classification in the option market. In particular, we propose to model trade classification with Transformers and gradient boosting. Both approaches are supervised and leverage labeled trades. For settings, where labeled trades are scarce, we extend Transformers with a pre-training objective to train on unlabeled trades as well as generate pseudo-labels for gradient boosting through a self-training procedure. -Our models establish a new state-of-the-art for trade classification on the \gls{ISE} and \gls{CBOE} dataset. For \gls{ISE} trades, Transformers achieve an accuracy of \SI{63.78}{\percent} when trained on trade and quoted prices as well as \SI{72.58}{\percent} when trained on additional quoted sizes, improving over current best of \textcite[][27]{grauerOptionTradeClassification2022} by \SI{3.73}{\percent} and \SI{4.97}{\percent}. Similarly, \glspl{GBRT} reach accuracies between \SI{63.67}{\percent} and \SI{73.24}{\percent}. We observe performance improvements up to \SI{6.51}{\percent} for \glspl{GBRT} and \SI{6.31}{\percent} for Transformers when models have access to option characteristics. Relative to the ubiquitous tick test, quote rule, and LR algorithm, improvements are \SI{23.88}{\percent}, \SI{17.11}{\percent}, and \SI{17.02}{\percent}. Outperformance is particularly strong for \gls{ITM} options, options with a long maturity, as well as options traded at the quotes. Both architectures generalise well on \gls{CBOE} data, with even stronger improvements between \SI{4.92}{\percent} and \SI{7.58}{\percent} over the benchmark depending on the model and feature set. +Our models establish a new state-of-the-art for trade classification on the \gls{ISE} and \gls{CBOE} dataset. For \gls{ISE} trades, Transformers achieve an accuracy of \SI{63.78}{\percent} when trained on trade and quoted prices as well as \SI{72.58}{\percent} when trained on additional quoted sizes, improving over current best of \textcite[][27]{grauerOptionTradeClassification2022} by \SI{3.73}{\percent} and \SI{4.97}{\percent}. Similarly, \glspl{GBRT} reach accuracies between \SI{63.67}{\percent} and \SI{73.24}{\percent}. We observe performance improvements up to \SI{6.51}{\percent} for \glspl{GBRT} and \SI{6.31}{\percent} for Transformers when models have access to option characteristics. Relative to the ubiquitous tick test, quote rule, and \gls{LR} algorithm, improvements are \SI{23.88}{\percent}, \SI{17.11}{\percent}, and \SI{17.02}{\percent}. Outperformance is particularly strong for \gls{ITM} options, options with a long maturity, as well as options traded at the quotes. Both architectures generalize well on \gls{CBOE} data, with even stronger improvements between \SI{4.92}{\percent} and \SI{7.58}{\percent} over the benchmark depending on the model and feature set. -In the semi-supervised setting, Transformers on \gls{ISE} dataset profit from pre-training on unlabelled trades with accuracies up to \SI{74.55}{\percent}, but the performance gains slightly diminish on the \gls{CBOE} test set. Vice versa, we observe no benefits from semi-supervised training of \glspl{GBRT}. 
+In the semi-supervised setting, Transformers on the \gls{ISE} dataset profit from pre-training on unlabeled trades with accuracies up to \SI{74.55}{\percent}, but the performance gains slightly diminish on the \gls{CBOE} test set. Conversely, we observe no benefits from semi-supervised training of \glspl{GBRT}. % Consistent with \textcites[][27]{grauerOptionTradeClassification2022}[][901]{savickasInferringDirectionOption2003} we find evidence that the performance of common trade classification rules deteriorates in the option market. In particular, tick-based methods marginally outperform a random guess. Unlike previous studies, we can trace back the performance of our approaches as well as of trade classification rules to individual features and feature groups using the importance measure \gls{SAGE}. We find that both paradigms attain the largest performance improvements from classifying trades based on quoted sizes and prices, but machine learning-based classifiers attain higher performance gains and effectively exploit the data. The change in the trade price, the decisive criterion for the (reverse) tick test, plays no role in option trade classification. We identify the relative illiquidity of options to affect the information content of the surrounding trade prices. Our classifiers profit from the inclusion of option-specific features, like moneyness and time-to-maturity, currently unexploited in classical trade classification. -By probing and visualising the attention mechanism of the Transformer, we can establish a connection to rule-based classification. Graphically, our results show, that attention heads encode knowledge about rule-based classification. Whilst attention heads in earlier layers of the network broadly attend to all features or their embeddings, later they focus on specific features jointly used in rule-based classification akin to the \gls{LR} algorithm, depth rule or others. Furthermore, embeddings encode domain knowledge. Our results demonstrate exemplary for traded underlying, that the Transformer learns to group similar underlyings in embedding space. +By probing and visualizing the attention mechanism of the Transformer, we can establish a connection to rule-based classification. Graphically, our results show that attention heads encode knowledge about rule-based classification. Whilst attention heads in earlier layers of the network broadly attend to all features or their embeddings, later they focus on specific features jointly used in rule-based classification, akin to the \gls{LR} algorithm, the depth rule, or others. Furthermore, embeddings encode domain knowledge. Using the traded underlying as an example, our results demonstrate that the Transformer learns to group similar underlyings in embedding space. -Our classifiers deliver accurate predictions and improved robustness, which effectively reduces noise and bias in option research dependent on reliable trade initiator estimates. When applied to measuring trading cost through effective spreads, the models dominate all rule-based approaches by approximating the true effective spread of options best. Exemplary, the Transformer pre-trained on unlabelled trades estimates a mean spread of \SI[round-mode=places, round-precision=3]{0.013118}[\$]{} versus \SI[round-mode=places, round-precision=3]{0.004926}[\$]{} actual spread at the \gls{ISE}. +Our classifiers deliver accurate predictions and improved robustness, which effectively reduces noise and bias in option research dependent on reliable trade initiator estimates.
When applied to measuring trading cost through effective spreads, the models dominate all rule-based approaches by approximating the true effective spread of options best. For example, the Transformer pre-trained on unlabeled trades estimates a mean spread of \SI[round-mode=places, round-precision=3]{0.013118}[\$]{} versus an actual spread of \SI[round-mode=places, round-precision=3]{0.004926}[\$]{} at the \gls{ISE}. -In conclusion, our study showcases the efficacy of machine learning as a viable alternative to existing trade signing algorithms for classifying option trades, if partially-labelled or labelled trades are available for training. % While we tested our models on option trades, we expect the results to transfer to other modalities including equity trades. +In conclusion, our study showcases the efficacy of machine learning as a viable alternative to existing trade signing algorithms for classifying option trades, provided that partially-labeled or labeled trades are available for training. % While we tested our models on option trades, we expect the results to transfer to other modalities including equity trades. \newpage \section{Outlook}\label{sec:outlook} -In future work, we plan to revisit training Transformers on a larger corpus of unlabelled trades through pre-training objectives and study the effects from \emph{exchange-specific} finetuning. While our current results show that pre-training positively drives classification performance, for comparability it is only performed on a small subset of trades and models have not fully converged. Thus, we expect to see benefits from additional data and compute, following the scaling laws of \textcite[][7]{hoffmannTrainingComputeOptimalLarge2022}. The application confers advantages when finetuning is constrained due to the limited availability of the true trade initiator. +In future work, we plan to revisit training Transformers on a larger corpus of unlabeled trades through pre-training objectives and study the effects of \emph{exchange-specific} finetuning. While our current results show that pre-training positively drives classification performance, for comparability it is only performed on a small subset of trades and models have not fully converged. Thus, we expect to see benefits from additional data and compute, following the scaling laws of \textcite[][7]{hoffmannTrainingComputeOptimalLarge2022}. The application confers advantages when finetuning is constrained due to the limited availability of the true trade initiator. -Indicatively, our results show that specific attention heads in the Transformer specialise in patterns akin to classical trade classification rules. We want to explore this aspect further and potentially reverse engineer classification rules from attention heads that are yet unknown. This way, we can transfer the superior classification accuracy of the Transformer to regimes where labels are unavailable or computational costs of training are not affordable. +Indicatively, our results show that specific attention heads in the Transformer specialize in patterns akin to classical trade classification rules. We want to explore this aspect further and potentially reverse engineer classification rules from attention heads that are yet unknown. This way, we can transfer the superior classification accuracy of the Transformer to regimes where labels are unavailable or computational costs of training are not affordable.
\ No newline at end of file diff --git a/reports/Content/evaluation.tex b/reports/Content/evaluation.tex index 0d9fe6c4..46d0c63a 100644 --- a/reports/Content/evaluation.tex +++ b/reports/Content/evaluation.tex @@ -4,7 +4,7 @@ \subsection{Evaluation}\label{sec:evaluation} \subsubsection{Evaluation Metric}\label{sec:evaluation-metric} -Our goal is to maximise the number of trades, where the predicted trade initiator matches the true trade initiator. We assess the quality of our model’s prediction in terms of \emph{accuracy}, which can be stated as: +Our goal is to maximize the number of trades where the predicted trade initiator matches the true trade initiator. We assess the quality of our model’s prediction in terms of \emph{accuracy}, which can be stated as: \begin{equation} % \operatorname{accuracy} \colon \mathbb{R}^{N} \times \mathbb{R}^{N} \to \left[0, 1\right], \quad \operatorname{accuracy}(\mathbf{y}, \widehat{\mathbf{y}}) = 1 - \frac{1}{N}\sum_{i=1}^{N} \operatorname{L}_{\mathrm{0-1}}(\mathbf{y}_i, \widehat{\mathbf{y}}_i), @@ -16,15 +16,15 @@ \subsubsection{Evaluation Metric}\label{sec:evaluation-metric} \label{eq:0-1-loss} \end{equation} -Intuitively, from the 0-1-loss we obtain the error rate on the dataset, as for every misclassified trade we count a loss of one and normalise by the number of samples $N$, which gives the normalised 0-1-loss. Notably, the loss is the same for false positives and negatives. +Intuitively, from the 0-1-loss we obtain the error rate on the dataset, as for every misclassified trade we count a loss of one and normalize by the number of samples $N$, which gives the normalized 0-1-loss. Notably, the loss is the same for false positives and false negatives. -Our datasets are approximately balanced and buyer-initiated trades predicted as seller-initiated and vice versa have similar associated costs, which makes accuracy an ideal choice as a performance metric.\footnote{The \gls{ISE} test set consists of \SI{48.5973}{\percent} of buy trades and \SI{46.1278}{\percent} of the \gls{CBOE} test set are buy trades.} As the 0-1-loss and in consequence, the accuracy is not differentiable, it cannot be used in optimisation, but as an early stopping criterion to halt training or as an optimisation target in the hyperparameter search. We report the accuracy of the test sets. +Our datasets are approximately balanced, and buyer-initiated trades predicted as seller-initiated and vice versa have similar associated costs, which makes accuracy an ideal choice as a performance metric.\footnote{The \gls{ISE} test set consists of \SI{48.5973}{\percent} buy trades, and \SI{46.1278}{\percent} of the \gls{CBOE} test set are buy trades.} As the 0-1-loss, and in consequence the accuracy, is not differentiable, it cannot be used directly in optimization, but only as an early stopping criterion to halt training or as an optimization target in the hyperparameter search. We report the accuracy of the test sets. \subsubsection{Feature Importance Measure}\label{sec:feature-importance-measure} Naturally, we aim to gain insights into the prediction process and identify relevant features, which fall under the umbrella of \emph{interpretability}. -Following, \textcite[][4]{liptonMythosModelInterpretability2017} interpretability can be reached through model transparency or post-hoc interpretability methods.
Transparent models provide interpretability through a transparent mechanism in the model, whereas post-hoc methods extract information from the already learnt model \autocite[][4--5]{liptonMythosModelInterpretability2017}. +Following \textcite[][4]{liptonMythosModelInterpretability2017}, interpretability can be reached through model transparency or post-hoc interpretability methods. Transparent models provide interpretability through a transparent mechanism in the model, whereas post-hoc methods extract information from the already learned model \autocite[][4--5]{liptonMythosModelInterpretability2017}. Classical trade classification algorithms, as rule-based classifiers, are transparent with an easily understandable decision process and thus provide interpretability \autocite[][91]{barredoarrietaExplainableArtificialIntelligence2020}. Interpretability, however, decreases for deep, stacked combinations involving a large feature count, when interactions between base rules become more complex and the effect of a single feature on the final prediction more challenging to interpret. @@ -46,7 +46,7 @@ \subsubsection{Feature Importance \end{equation} where $D=\left\{1,\ldots,d\right\}$ is a set of feature indices corresponding to the features $x_1,\ldots,x_d$ and $S\subset D$. Intuitively, \cref{eq:shapley} estimates the Shapley value as the weighted average of the incremental change in the value function, $v_f(S)$, before and after the $i$-th feature is added to the feature subsets $S$. Hereby, the first term $\left(\begin{smallmatrix} d-1 \\|S|\end{smallmatrix}\right)^{-1}$ accounts for the possibilities to choose a $|S|$-sized subset from $D \backslash\{i\}$. -While subsets of features $X_S = \left\{X_i \mid i \in S \right\}$ can be easily constructed, most classifiers, including ours, cannot handle the absence of features as they require fixed-sized inputs during inference. \textcite[][2]{covertUnderstandingGlobalFeature2020} mitigate the issue, by marginalising out the missing features $\bar{S}=D\backslash S$ using the conditional distribution $p(X_{\bar{S}} \mid X_S=x_S)$. Following \textcite[][4--5]{covertUnderstandingGlobalFeature2020}, the performance of the model for a given subset of features $S$ and loss function $\ell$ can now be estimated by +While subsets of features $X_S = \left\{X_i \mid i \in S \right\}$ can be easily constructed, most classifiers, including ours, cannot handle the absence of features as they require fixed-sized inputs during inference. \textcite[][2]{covertUnderstandingGlobalFeature2020} mitigate the issue by marginalizing out the missing features $\bar{S}=D\backslash S$ using the conditional distribution $p(X_{\bar{S}} \mid X_S=x_S)$. Following \textcite[][4--5]{covertUnderstandingGlobalFeature2020}, the performance of the model for a given subset of features $S$ and loss function $\ell$ can now be estimated by \begin{equation} v_f(S)=-\mathbb{E}\left[\ell\left(\mathbb{E}\left[f(X) \mid X_S\right], Y\right)\right]. \end{equation} @@ -57,9 +57,9 @@ \subsubsection{Feature Importance In addition to \gls{SAGE}, Transformer-based models offer \emph{some} interpretability through their attention mechanism. Consistent with \textcite[][8]{wiegreffeAttentionNotNot2019}, we view attention scores as a vehicle to model transparency. -Recall from our discussion on attention (cp. \cref{sec:attention}) that the attention matrix stores how much attention a token pays to each of the keys.
Thus, feature attributions can be derived from attention by visualising features to which the model attends to in an attention map. While attention maps are specific to Transformers or other attention-based architectures, rendering them useless for cross-model comparisons, they give additional insights from different attention layers and attention heads of the model on a per-trade and global basis. +Recall from our discussion on attention (cp. \cref{sec:attention}) that the attention matrix stores how much attention a token pays to each of the keys. Thus, feature attributions can be derived from attention by visualizing features to which the model attends to in an attention map. While attention maps are specific to Transformers or other attention-based architectures, rendering them useless for cross-model comparisons, they give additional insights from different attention layers and attention heads of the model on a per-trade and global basis. -In the tabular domain, various approaches have been investigated in the literature to obtain attention from multiple attention heads and Transformer blocks. \textcite[][18]{somepalliSaintImprovedNeural2021} and \textcite[][11]{borisovDeepNeuralNetworks2022} gather attention maps from the first attention layer only, and \textcite[][11]{borisovDeepNeuralNetworks2022} additionally obtain feature attributions by taking the diagonal of the attention matrix $\mathbf{A}$ or through column-wise summation. In contrast, \textcite[][10]{gorishniyRevisitingDeepLearning2021} leverage all attention matrices by averaging over multiple Transformer blocks, attention heads, and samples to obtain global feature attributions. Given \cref{sec:architectural-overview,sec:attention}, where we emphasised the unique role of attention heads and lower sub-layers, both approaches may be myopic, as attention heads contribute unequally to the result, or as later attention layers are neglected altogether. +In the tabular domain, various approaches have been investigated in the literature to obtain attention from multiple attention heads and Transformer blocks. \textcite[][18]{somepalliSaintImprovedNeural2021} and \textcite[][11]{borisovDeepNeuralNetworks2022} gather attention maps from the first attention layer only, and \textcite[][11]{borisovDeepNeuralNetworks2022} additionally obtain feature attributions by taking the diagonal of the attention matrix $\mathbf{A}$ or through column-wise summation. In contrast, \textcite[][10]{gorishniyRevisitingDeepLearning2021} leverage all attention matrices by averaging over multiple Transformer blocks, attention heads, and samples to obtain global feature attributions. Given \cref{sec:architectural-overview,sec:attention}, where we emphasized the unique role of attention heads and lower sub-layers, both approaches may be myopic, as attention heads contribute unequally to the result, or as later attention layers are neglected altogether. 
While not explored systematically in the tabular domain yet, the rollout attention method of \textcite[][3]{abnarQuantifyingAttentionFlow2020} combines raw attention from multiple layers through recursive matrix multiplication with the weight matrices from attention layers below, as shown in the following equation:\footnote{Notation adapted from \textcite[][786]{cheferTransformerInterpretabilityAttention2021}.} \begin{equation} diff --git a/reports/Content/introduction.tex b/reports/Content/introduction.tex index c54a5ef6..1be51bcf 100644 --- a/reports/Content/introduction.tex +++ b/reports/Content/introduction.tex @@ -12,11 +12,11 @@ \section{Introduction}\label{sec:introduction} Our work fills this gap by focusing on machine learning methods to infer the trade initiator in the option market. Approaching trade classification with machine learning is a logical choice, given its capability to handle high-dimensional trade data and learn complex decision boundaries. This raises the question, \emph{can an alternative machine learning-based classifier improve upon the accuracy of state-of-the-art approaches for option trade classification?} -To answer this question, we model trade classification through machine learning. We consider the supervised case, where fully-labelled trade data is available, as well as the semi-supervised setting, where trades are partially labelled with the true trade initiator. Our work makes the following contributions: +To answer this question, we model trade classification through machine learning. We consider the supervised case, where fully-labeled trade data is available, as well as the semi-supervised setting, where trades are partially labeled with the true trade initiator. Our work makes the following contributions: \begin{enumerate}[label=(\roman*),noitemsep] \item We employ state-of-the-art supervised algorithms, i.~e., gradient-boosted trees and Transformer networks, to the problem of trade classification and benchmark these approaches against rule-based methods. Our approaches outperform all rule-based approaches on \gls{ISE} and \gls{CBOE} data with comparable data requirements. In the application setting, our approaches approximate the true effective spread best. - \item In a real-world setting, labelled trades are typically scarce, while unlabelled trades are abundant. Motivated by this consideration, we extend the classifiers to learn on both labelled and unlabelled instances through pre-training and self-training procedures. We analyse the effect on classification accuracy and observe that pre-training of Transformers further alleviates accuracy on \gls{ISE} trades. - \item Through a game-theoretic approach, our work is the first to consistently attribute the performance of rule-based and machine learning-based classification to individual features. We discover that both paradigms share common features, but machine learning-based classifiers attain higher performance gains and effectively exploit the data. By probing and visualising the attention mechanism in the Transformer, we can strengthen the connection to rule-based classification and reveal that \emph{learned} rules mimic classical rules. + \item In a real-world setting, labeled trades are typically scarce, while unlabeled trades are abundant. Motivated by this consideration, we extend the classifiers to learn on both labeled and unlabeled instances through pre-training and self-training procedures.
We analyze the effect on classification accuracy and observe that pre-training of Transformers further improves accuracy on \gls{ISE} trades. + \item Through a game-theoretic approach, our work is the first to consistently attribute the performance of rule-based and machine learning-based classification to individual features. We discover that both paradigms share common features, but machine learning-based classifiers attain higher performance gains and effectively exploit the data. By probing and visualizing the attention mechanism in the Transformer, we can strengthen the connection to rule-based classification and reveal that \emph{learned} rules mimic classical rules. \end{enumerate} -The remainder of this thesis is organised as follows. \cref{sec:related-work} reviews publications on trade classification in option markets and using machine learning, thereby underpinning our research framework. \cref{sec:rule-based-approaches} introduces extant methods for rule-based trade classification. \cref{sec:supervised-approaches} discusses and introduces supervised methods for trade classification. Then, \cref{sec:semi-supervised-approaches} extends the previously selected algorithms for the semi-supervised case. We test the models in \cref{sec:empirical-study} in an empirical setting. In \cref{sec:application} we apply our models to the problem of effective spread estimation. Finally, \cref{sec:discussion} discusses limitations and \cref{sec:conclusion} concludes. +The remainder of this thesis is organized as follows. \cref{sec:related-work} reviews publications on trade classification in option markets and using machine learning, thereby underpinning our research framework. \cref{sec:rule-based-approaches} introduces extant methods for rule-based trade classification. \cref{sec:supervised-approaches} discusses and introduces supervised methods for trade classification. Then, \cref{sec:semi-supervised-approaches} extends the previously selected algorithms for the semi-supervised case. We test the models in \cref{sec:empirical-study} in an empirical setting. In \cref{sec:application} we apply our models to the problem of effective spread estimation. Finally, \cref{sec:discussion} discusses limitations, and \cref{sec:conclusion} concludes. diff --git a/reports/Content/main-expose.tex b/reports/Content/main-expose.tex index 2405b3ed..aef56e70 100644 --- a/reports/Content/main-expose.tex +++ b/reports/Content/main-expose.tex @@ -13,59 +13,59 @@ \section{Exposé} \textbf{Introduction} -In the introduction, we provide motivation and present our key findings. The contributions are three-fold: (I) We employ state-of-the-art machine learning algorithms i.~e., gradient-boosted trees and transformer networks, for trade classification. Tree-based approaches outperform state-of-the-art trade classification rules in out-of-sample tests. (II) As part of semi-supervised approaches, we study the impact of incorporating unlabelled trades into the training procedure on trade classification accuracy. (III) We consistently interpret feature contributions to classical trade classification rules and machine learning models with a game-theoretic approach. +In the introduction, we provide motivation and present our key findings. The contributions are three-fold: (I) We employ state-of-the-art machine learning algorithms, i.~e., gradient-boosted trees and transformer networks, for trade classification. Tree-based approaches outperform state-of-the-art trade classification rules in out-of-sample tests.
(II) As part of semi-supervised approaches, we study the impact of incorporating unlabeled trades into the training procedure on trade classification accuracy. (III) We consistently interpret feature contributions to classical trade classification rules and machine learning models with a game-theoretic approach. \textbf{Related Work} While classical trade classification algorithms are extensively tested in the stock markets \autocites[e.~g.,][]{chakrabartyTradeClassificationAlgorithms2012}{odders-whiteOccurrenceConsequencesInaccurate2000}, few works have examined trade classification in option markets \autocites{grauerOptionTradeClassification2022}{savickasInferringDirectionOption2003}. -For option markets, the sole focus is on classical classification rules. Even in stock markets, machine learning has hardly been applied to trade classification. An early work of \textcite{rosenthalModelingTradeDirection2012} incorporates standard trade classification rules into a logistic regression model and achieves outperformance in the stock market. Similarly, \textcites{fedeniaMachineLearningCorporate2021}{ronenMachineLearningTrade2022} improve upon classical rules with a random forest, a tree-based ensemble. Albeit their work considers a broad range of approaches, the selection leaves the latest advancements in artificial neural networks and ensemble learning aside. Even if the focus is on standard techniques, the unclear research agenda with regards to model selection, tuning, and testing hampers the transferability of their results to the yet unstudied option market. +For option markets, the sole focus is on classical classification rules. Even in stock markets, machine learning has hardly been applied to trade classification. An early work of \textcite{rosenthalModelingTradeDirection2012} incorporates standard trade classification rules into a logistic regression model and achieves outperformance in the stock market. Similarly, \textcites{fedeniaMachineLearningCorporate2021}{ronenMachineLearningTrade2022} improve upon classical rules with a random forest, a tree-based ensemble. Albeit their work considers a broad range of approaches, the selection leaves the latest advancements in artificial neural networks and ensemble learning aside. Even if the focus is on standard techniques, the unclear research agenda with regard to model selection, tuning, and testing hampers the transferability of their results to the yet unstudied option market. \textbf{Methodology} We start by introducing the basic quote rule, the tick test, the reverse tick test, the depth rule \autocite{grauerOptionTradeClassification2022}, and the trade size rule \autocite{grauerOptionTradeClassification2022} and derive popular hybrids thereof. Namely, the \gls{LR} algorithm, the \gls{EMO} algorithm, and the \gls{CLNV} method. We discuss deviations from the original algorithm, like the offset in the \gls{LR} algorithm. Optionally, we include Rosenthal's method \autocite{rosenthalModelingTradeDirection2012}, which incorporates the tick test, \gls{LR}, and \gls{EMO} algorithm into a logistic regression model. Our focus is on the features used within the rules and their economic intuition. We also stress the link between hybrid rules and ensemble techniques studied in machine learning. Classical trade classification rules serve as a benchmark in our study. -Data sets of option trades adhere to a tabular format. Thus, we begin with reviewing state-of-the-art algorithms for classifying tabular data in terms of accuracy. 
Possible models must support both categorical features e.~g., exercise style of the option and continuous features, e.~g., the option's $\Delta$. Most often, the true label i.~e., indicator if trade is buyer-initiated, can only be inferred for fractions of the data set \autocites{grauerOptionTradeClassification2022}{savickasInferringDirectionOption2003}. Large portions remain unlabelled. Leaving the unlabelled data aside, option trade classification can be viewed as a supervised classification task. Recent research \autocites{arikTabNetAttentiveInterpretable2020}{huangTabTransformerTabularData2020}{yoonVIMEExtendingSuccess2020} indicates, however, that leveraging unlabelled data can further improve classifier performance. Thus, we also frame the problem of trade classification in option markets as a semi-supervised classification task, whereby unlabelled and labelled data is incorporated into the learning procedure. +Data sets of option trades adhere to a tabular format. Thus, we begin by reviewing state-of-the-art algorithms for classifying tabular data in terms of accuracy. Possible models must support both categorical features, e.~g., the exercise style of the option, and continuous features, e.~g., the option's $\Delta$. Most often, the true label, i.~e., an indicator of whether the trade is buyer-initiated, can only be inferred for fractions of the data set \autocites{grauerOptionTradeClassification2022}{savickasInferringDirectionOption2003}. Large portions remain unlabeled. Leaving the unlabeled data aside, option trade classification can be viewed as a supervised classification task. Recent research \autocites{arikTabNetAttentiveInterpretable2020}{huangTabTransformerTabularData2020}{yoonVIMEExtendingSuccess2020} indicates, however, that leveraging unlabeled data can further improve classifier performance. Thus, we also frame the problem of trade classification in option markets as a semi-supervised classification task, whereby unlabeled and labeled data is incorporated into the learning procedure. -Our selection will likely consider wide ensembles in the form of gradient-boosted trees and deep, transformer-based neural networks, such as \textit{TabNet} \autocite{arikTabNetAttentiveInterpretable2020} or \textit{TabTransformer} \autocite{huangTabTransformerTabularData2020}, due to their superior performance in large scale comparisons \autocites{borisovDeepNeuralNetworks2022}{gorishniyRevisitingDeepLearning2021}{grinsztajnWhyTreebasedModels2022}{shwartz-zivTabularDataDeep2021}. Also, both model classes can naturally be enhanced to profit from partially unlabelled data and are interpretable locally and globally. +Our selection will likely consider wide ensembles in the form of gradient-boosted trees and deep, transformer-based neural networks, such as \emph{TabNet} \autocite{arikTabNetAttentiveInterpretable2020} or \emph{TabTransformer} \autocite{huangTabTransformerTabularData2020}, due to their superior performance in large-scale comparisons \autocites{borisovDeepNeuralNetworks2022}{gorishniyRevisitingDeepLearning2021}{grinsztajnWhyTreebasedModels2022}{shwartz-zivTabularDataDeep2021}. Also, both model classes can naturally be enhanced to profit from partially unlabeled data and are interpretable locally and globally. Thereafter, we thoroughly introduce the models for the supervised setting. We start with the notion of classical decision trees, as covered by \textcite{breimanClassificationRegressionTrees2017}. Trees are inherent to tree-based boosting approaches as weak learners.
Thus, emphasis is put on the selection of features and the splitting process of the predictor space into disjoint regions. We motivate the use of ensemble approaches, such as gradient-boosted trees, with the poor variance property of decision trees. The subsequent chapter draws on \textcite{hastietrevorElementsStatisticalLearning2009} and \textcite{friedmanGreedyFunctionApproximation2001} with a focus on gradient boosting for classification. Therein, we introduce necessary enhancements to the boosting procedure to support probabilistic classification and discuss arising stability issues. Further adjustments are necessary for the treatment of categorical variables. Therefore, we draw on the ordered boosting by \textcite{prokhorenkovaCatBoostUnbiasedBoosting2018}, which enhances the classical gradient boosting algorithm. -Next, we focus on transformer networks for tabular data. We begin with the classical transformer architecture of \textcite{vaswaniAttentionAllYou2017}. We put our focus on introducing central concepts like the encoder-decoder structure, attention, embeddings, or point-wise networks. These chapters lay the basis for the subsequent tabular-specific architectures like \textit{TabNet} or \textit{TabTransformer}. Since the classical transformer is tailored to sequence-to-sequence modelling, it can not be directly applied to tabular data. +Next, we focus on transformer networks for tabular data. We begin with the classical transformer architecture of \textcite{vaswaniAttentionAllYou2017}. We put our focus on introducing central concepts like the encoder-decoder structure, attention, embeddings, or point-wise networks. These chapters lay the basis for the subsequent tabular-specific architectures like \emph{TabNet} or \emph{TabTransformer}. Since the classical transformer is tailored to sequence-to-sequence modeling, it can not be directly applied to tabular data. -Specialised for tabular data is the \textit{TabTransformer} of \textcite{huangTabTransformerTabularData2020}. The architecture utilises stacked transformers to learn contextual embeddings of categorical features, whereas continuous features are directly input into a standard, feed-forward network. +Specialized for tabular data is the \emph{TabTransformer} of \textcite{huangTabTransformerTabularData2020}. The architecture utilizes stacked transformers to learn contextual embeddings of categorical features, whereas continuous features are directly input into a standard, feed-forward network. -Another alternative is \textit{TabNet} \autocite{arikTabNetAttentiveInterpretable2020}, which fuses the concept of decision trees and transformers. Similar to growing a decision tree, several subnetworks are used to process the input in a sequential, hierarchical fashion. Sequential attention, a variant of attention, is used to decide which features to use in each step. The output of \textit{TabNet} is the aggregate of all subnetworks. Despite its difference, concepts like the encoder or attention can be transferred from the previous variants. +Another alternative is \emph{TabNet} \autocite{arikTabNetAttentiveInterpretable2020}, which fuses the concept of decision trees and transformers. Similar to growing a decision tree, several sub-networks are used to process the input in a sequential, hierarchical fashion. Sequential attention, a variant of attention, is used to decide which features to use in each step. The output of \emph{TabNet} is the aggregate of all sub-networks. 
Despite its difference, concepts like the encoder or attention can be transferred from the previous variants. -Next, we demonstrate how the models from above can be enhanced for the semi-supervised setting. We provide a short discussion on different alternatives. For gradient-boosted trees, self-training \autocite{yarowskyUnsupervisedWordSense1995} is used to obtain pseudo labels for unlabelled parts of the data set. The ensemble itself is trained on both true and pseudo labels. For the neural networks, the scope is limited to separate pre-training procedures to maintain consistency with the supervised counterparts. For \textit{TabNet}, we use unsupervised pre-training of the encoder as propagated in \textcite{arikTabNetAttentiveInterpretable2020}. Equally, for the \textit{TabTransformer}, we pre-train the transformer layers and column embeddings through masked language modelling or replaced token detection as popularised in \textcite{devlinBERTPretrainingDeep2019} and \textcite{clarkELECTRAPretrainingText2020}, respectively. +Next, we demonstrate how the models from above can be enhanced for the semi-supervised setting. We provide a short discussion on different alternatives. For gradient-boosted trees, self-training \autocite{yarowskyUnsupervisedWordSense1995} is used to obtain pseudo labels for unlabeled parts of the data set. The ensemble itself is trained on both true and pseudo labels. For the neural networks, the scope is limited to separate pre-training procedures to maintain consistency with the supervised counterparts. For \emph{TabNet}, we use unsupervised pre-training of the encoder as propagated in \textcite{arikTabNetAttentiveInterpretable2020}. Equally, for the \emph{TabTransformer}, we pre-train the transformer layers and column embeddings through masked language modeling or replaced token detection as popularized in \textcite{devlinBERTPretrainingDeep2019} and \textcite{clarkELECTRAPretrainingText2020}, respectively. \textbf{Empirical Study} In our empirical analysis, we introduce the data sets, the generation of true labels, and the applied pre-processing. The data sets contain option trades executed at either the \gls{CBOE} or the \gls{ISE} with additional intraday option price and quote data, end-of-day buy and sell trading volumes, characteristics of the option, and the underlying. Yet our primary focus is on classifying \gls{ISE} trades, with a secondary emphasis on the \gls{CBOE} data set. -Subsets of the \gls{CBOE} and the \gls{ISE} data set have been previously studied in \textcite{grauerOptionTradeClassification2022}. Thus, we align the data pre-processing with their work to maintain consistency. Nevertheless, some deviations are necessary for training the machine learning models. These include the imputation of missing features, standardisation, resampling, feature transformations, and feature subset selection. While all our models can theoretically handle raw tabular data without prior processing \autocites{arikTabNetAttentiveInterpretable2020}{prokhorenkovaCatBoostUnbiasedBoosting2018}{huangTabTransformerTabularData2020}, we expect to improve the model's performance with these additional steps. Features are derived through feature transformations e.~g., the relative distance of the trade from the midpoint found in the \gls{CLNV} method, to incorporate them into our models while not incorporating the rule directly. Doing so provides insights into the relationship between classical and machine learning-based approaches. 
Like \textcite{ronenMachineLearningTrade2022}, we define different subsets of data i.~e., one that includes only features found in the classical algorithms and another incorporating option characteristics as well as price and trading data. Finally, unlabelled data is kept for the training of semi-supervised models. +Subsets of the \gls{CBOE} and the \gls{ISE} data set have been previously studied in \textcite{grauerOptionTradeClassification2022}. Thus, we align the data pre-processing with their work to maintain consistency. Nevertheless, some deviations are necessary for training the machine learning models. These include the imputation of missing features, standardization, resampling, feature transformations, and feature subset selection. While all our models can theoretically handle raw tabular data without prior processing \autocites{arikTabNetAttentiveInterpretable2020}{prokhorenkovaCatBoostUnbiasedBoosting2018}{huangTabTransformerTabularData2020}, we expect to improve the model's performance with these additional steps. Features are derived through feature transformations e.~g., the relative distance of the trade from the midpoint found in the \gls{CLNV} method, to incorporate them into our models while not incorporating the rule directly. Doing so provides insights into the relationship between classical and machine learning-based approaches. Like \textcite{ronenMachineLearningTrade2022}, we define different subsets of data i.~e., one that includes only features found in the classical algorithms and another incorporating option characteristics as well as price and trading data. Finally, unlabeled data is kept for the training of semi-supervised models. -The data set is split into three disjoint sets for training, validation, and testing. As in \textcite{ellisAccuracyTradeClassification2000} and \textcite{ronenMachineLearningTrade2022} we perform a classical train-test split, thereby maintaining the temporal ordering within the data. We rely on labelled data to assess the performance of trade classification rules. With statistical tests, we verify that the distribution of the features and target is maintained on the test set. Due to the number of model combinations considered and the computational demand of transformers and gradient-boosted trees, we expect $k$-fold cross-validation to be practically intractable. +The data set is split into three disjoint sets for training, validation, and testing. As in \textcite{ellisAccuracyTradeClassification2000} and \textcite{ronenMachineLearningTrade2022} we perform a classical train-test split, thereby maintaining the temporal ordering within the data. We rely on labeled data to assess the performance of trade classification rules. With statistical tests, we verify that the distribution of the features and target is maintained on the test set. Due to the number of model combinations considered and the computational demand of transformers and gradient-boosted trees, we expect $k$-fold cross-validation to be practically intractable. Next, we describe the implementation and training of the supervised and semi-supervised models, as well as classical trade classification rules. -For a consistent evaluation, we opt to implement classical rules like the \gls{LR} algorithm as a classifier conforming to the programming interface of \textit{Scikit-learn} \autocite{pedregosaScikitlearnMachineLearning2018}. 
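A rule-based classifier wrapped in the Scikit-learn estimator interface could look roughly as follows. This is a minimal sketch, not the thesis implementation: the column names and the quote-rule-with-tick-fallback logic are illustrative assumptions.

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin

class QuoteRuleClassifier(BaseEstimator, ClassifierMixin):
    """Signs trades as buys (+1) or sells (-1) with the quote rule and a simplified tick-rule fallback."""

    def fit(self, X, y=None):
        # Nothing is learned for a rule-based classifier, but fit() is required by the interface.
        self.classes_ = np.array([-1, 1])
        return self

    def predict(self, X):
        # Assumes a DataFrame with hypothetical columns: bid, ask, trade_price, price_lag.
        mid = (X["bid"].to_numpy() + X["ask"].to_numpy()) / 2
        price = X["trade_price"].to_numpy()
        prev_price = X["price_lag"].to_numpy()

        pred = np.where(price > mid, 1, np.where(price < mid, -1, 0))
        # Simplified tick test for midspread trades: up-ticks are buys, everything else sells.
        tick = np.where(price > prev_price, 1, -1)
        return np.where(pred == 0, tick, pred)

Because the class conforms to the estimator API, it can be scored and compared with the same utilities as the learned models.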
-Gradient boosting is implemented using \textit{CatBoost} by \textcite{prokhorenkovaCatBoostUnbiasedBoosting2018}.\textit{TabNet} and \textit{TabTransformer} are implemented in \textit{PyTorch} \autocite{paszkePyTorchImperativeStyle2019} and \textit{skorch} based on the original papers. Deviations from the papers are reported. -For training, we employ various model-agnostic deep learning practises like learning rate decay, drop out \autocite{hintonImprovingNeuralNetworks2012}, early stopping, ensembles \autocite{huangSnapshotEnsemblesTrain2017} or stochastic weight averaging \autocite{izmailovAveragingWeightsLeads2019} to speed up training or to obtain a better generalisation. We report the loss curves to detect over- or underfitting and study learning curves to get insights into our models' bias and variance properties. +For a consistent evaluation, we opt to implement classical rules like the \gls{LR} algorithm as a classifier conforming to the programming interface of \emph{Scikit-learn} \autocite{pedregosaScikitlearnMachineLearning2018}. +Gradient boosting is implemented using \emph{CatBoost} by \textcite{prokhorenkovaCatBoostUnbiasedBoosting2018}. \emph{TabNet} and \emph{TabTransformer} are implemented in \emph{PyTorch} \autocite{paszkePyTorchImperativeStyle2019} and \emph{skorch} based on the original papers. Deviations from the papers are reported. +For training, we employ various model-agnostic deep learning practices like learning rate decay, dropout \autocite{hintonImprovingNeuralNetworks2012}, early stopping, ensembles \autocite{huangSnapshotEnsemblesTrain2017}, or stochastic weight averaging \autocite{izmailovAveragingWeightsLeads2019} to speed up training or to obtain a better generalization. We report the loss curves to detect over- or underfitting and study learning curves to get insights into our models' bias and variance properties. -In contrast to \textcite{ronenMachineLearningTrade2022} we emphasise a transparent hyperparameter tuning procedure. We tune with a novel Bayesian optimisation based on the tree-structured parzen estimator algorithm. Compared to other approaches like a randomised search, unpromising search regions are omitted, thus requiring fewer search trails. Bayesian search is also reported to be superior to a randomised search \autocite{turnerBayesianOptimizationSuperior2021}. The search space for the parameters is based on the configurations in the corresponding papers. An implementation by \textcite{akibaOptunaNextgenerationHyperparameter2019} is used to optimise for the accuracy on the validation set. Searches may be repeated multiple times with different initializations. +In contrast to \textcite{ronenMachineLearningTrade2022}, we emphasize a transparent hyperparameter tuning procedure. We tune with a novel Bayesian optimization based on the tree-structured Parzen estimator algorithm. Compared to other approaches like a randomized search, unpromising search regions are omitted, thus requiring fewer search trials. Bayesian search is also reported to be superior to a randomized search \autocite{turnerBayesianOptimizationSuperior2021}. The search space for the parameters is based on the configurations in the corresponding papers. An implementation by \textcite{akibaOptunaNextgenerationHyperparameter2019} is used to optimize for accuracy on the validation set. Searches may be repeated multiple times with different initializations.
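A tuning loop in this spirit can be sketched with Optuna's tree-structured Parzen estimator sampler and CatBoost; the synthetic data, search space, and trial count below are placeholder assumptions, not the configuration used in the study.

import optuna
from catboost import CatBoostClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Synthetic stand-in for the labelled trade sample; shuffle=False mimics a temporal split.
X, y = make_classification(n_samples=5000, n_features=10, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=False)

def objective(trial):
    # Illustrative search space only.
    model = CatBoostClassifier(
        depth=trial.suggest_int("depth", 4, 10),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1.0, 30.0, log=True),
        iterations=300,
        verbose=False,
    )
    model.fit(X_train, y_train)
    return accuracy_score(y_val, model.predict(X_val))

study = optuna.create_study(direction="maximize",
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)
print(study.best_params)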
-We report the optimisation metric on the training, validation, and test set to study the impact of different learning schemes and the learning of generalizable features. Visualisation-wise, the chapter may include a study of loss surfaces. The expectation is that pre-training improves both the training and validation loss due to the larger sample size seen during training. A decline between the sets may be observed. +We report the optimization metric on the training, validation, and test set to study the impact of different learning schemes and the learning of generalizable features. Visualization-wise, the chapter may include a study of loss surfaces. The expectation is that pre-training improves both the training and validation loss due to the larger sample size seen during training. A decline between the sets may be observed. -Subsequently, the model is evaluated. Firstly, a comparison between the selected features is conducted. \textit{TabNet}, \textit{TabTransformer}, and gradient-boosted trees are interpretable by design but rely on model-specific techniques such as feature activation masks found only in transformer-based models rendering them useless for cross-model comparisons. Still, we rely on activation masks to study trades on the transaction level. To compare all models, we suggest kernel \gls{SHAP} \autocite{lundbergUnifiedApproachInterpreting2017} or random feature permutation by \textcite{breimanRandomForests2001} for local and global interpretability. Both approaches are advantageous over logistic regression, as previously used by \textcites{savickasInferringDirectionOption2003}{chakrabartyTradeClassificationAlgorithms2012}, with their ability to capture non-linear interactions between features. Due to the implementation of the classical rules as an estimator, we can perform a fair comparison between classical and machine learning-based approaches. We back the observed results with economic intuition. +Subsequently, the model is evaluated. Firstly, a comparison between the selected features is conducted. \emph{TabNet}, \emph{TabTransformer}, and gradient-boosted trees are interpretable by design but rely on model-specific techniques such as feature activation masks found only in transformer-based models rendering them useless for cross-model comparisons. Still, we rely on activation masks to study trades on the transaction level. To compare all models, we suggest kernel \gls{SHAP} \autocite{lundbergUnifiedApproachInterpreting2017} or random feature permutation by \textcite{breimanRandomForests2001} for local and global interpretability. Both approaches are advantageous over logistic regression, as previously used by \textcites{savickasInferringDirectionOption2003}{chakrabartyTradeClassificationAlgorithms2012}, with their ability to capture non-linear interactions between features. Due to the implementation of the classical rules as an estimator, we can perform a fair comparison between classical and machine learning-based approaches. We back the observed results with economic intuition. -Secondly, we benchmark \textit{TabNet}, \textit{TabTransformer}, and gradient-boosted trees against the classical trade classification rules. Following a common track in literature, accuracy is the decisive metric. The analysis may be supported with additional metrics like the receiver operator characteristic, area under the curve, or confusion matrices. 
We expect both semi-supervised and supervised algorithms to outperform the benchmarks with additional performance gains from learning on unlabelled data. +Secondly, we benchmark \emph{TabNet}, \emph{TabTransformer}, and gradient-boosted trees against the classical trade classification rules. Following a common track in the literature, accuracy is the decisive metric. The analysis may be supported with additional metrics like the receiver operating characteristic, area under the curve, or confusion matrices. We expect both semi-supervised and supervised algorithms to outperform the benchmarks with additional performance gains from learning on unlabeled data. Based on preliminary tests, gradient-boosted trees outperform classical approaches on the \gls{ISE} data set, reaching a classification accuracy of 69.84~\%\footnote{See \url{https://wandb.ai/fbv/thesis/runs/3dpde4cy} for the run configuration.}. The static testing period spans from November 2015 to May 2017. -Despite serious counter efforts, our models can still poorly generalise. We use rigorous robustness cheques to test if the accuracy is maintained across time, trade sizes, underlyings, and exchanges, among others. The procedure follows \textcites{chakrabartyTradeClassificationAlgorithms2012}{grauerOptionTradeClassification2022}{ronenMachineLearningTrade2022}{savickasInferringDirectionOption2003}. Motivated by research of \textcite{grinsztajnWhyTreebasedModels2022}, we conduct a robustness study of our models to both informative and uninformative features. +Despite serious counter efforts, our models may still generalize poorly. We use rigorous robustness checks to test if the accuracy is maintained across time, trade sizes, underlyings, and exchanges, among others. The procedure follows \textcites{chakrabartyTradeClassificationAlgorithms2012}{grauerOptionTradeClassification2022}{ronenMachineLearningTrade2022}{savickasInferringDirectionOption2003}. Motivated by the research of \textcite{grinsztajnWhyTreebasedModels2022}, we conduct a robustness study of our models with respect to both informative and uninformative features. -All in all, our empirical analysis aims for reproducibility. We implement sophisticated data set versioning and experiment tracking using \textit{weights \& biases}~\footnote{Experiments are tracked at \url{https://wandb.ai/fbv/thesis}.}. The correctness of the code is verified with automated tests \footnote{Code is available at~\url{https://github.com/KarelZe/thesis}.}. +All in all, our empirical analysis aims for reproducibility. We implement sophisticated data set versioning and experiment tracking using \emph{Weights \& Biases}\footnote{Experiments are tracked at \url{https://wandb.ai/fbv/thesis}.}. The correctness of the code is verified with automated tests\footnote{Code is available at~\url{https://github.com/KarelZe/thesis}.}.
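The robustness checks described above amount to grouping test-set predictions by the dimension of interest and recomputing accuracy per bucket. A minimal illustration with made-up data; the frame layout and column names are hypothetical:

import pandas as pd

# Hypothetical result frame: one row per classified trade in the test set.
results = pd.DataFrame({
    "year":       [2015, 2015, 2016, 2016, 2017, 2017],
    "trade_size": ["small", "large", "small", "large", "small", "large"],
    "y_true":     [1, -1, 1, 1, -1, 1],
    "y_pred":     [1, -1, -1, 1, -1, -1],
})
results["correct"] = results["y_true"] == results["y_pred"]

# Is accuracy maintained across time and trade-size buckets?
print(results.groupby("year")["correct"].mean())
print(results.groupby("trade_size")["correct"].mean())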
\textbf{Discussion and Conclusion} diff --git a/reports/Content/related-work.tex b/reports/Content/related-work.tex index 192ce746..bb8ed2d9 100644 --- a/reports/Content/related-work.tex +++ b/reports/Content/related-work.tex @@ -16,9 +16,9 @@ \subsection{Trade Classification in Option Markets} \subsection{Trade Classification Using Machine Learning} \label{sec:trade-classification-using-machine-learning} -\textcite[5]{rosenthalModelingTradeDirection2012} bridges the gap between classical trade classification and machine learning by fitting a logistic regression model on lagged and unlagged features innate to the tick rule, quote rule, and \gls{EMO} algorithm, as well as a sector-specific and a time-specific term. Instead of using the rule's discretized outcome as a feature, he models the rules through so-called information strength functions \autocite[6--7]{rosenthalModelingTradeDirection2012}. The proximity to the quotes, central to the \gls{EMO} algorithm, is thus modelled by a proximity function. Likewise, the information strength of the quote and tick rule is estimated as the log return between the trade price and the midpoint or the previous trade price. However, it only improves the accuracy of the \gls{EMO} algorithm by a marginal \SI{2.00}{\percent} for \gls{NASDAQ} stocks and \SI{1.10}{\percent} for \gls{NYSE} stocks \autocite[15]{rosenthalModelingTradeDirection2012}. Our work aims to improve the model by exploring non-linear estimators and minimising data modelling assumptions. +\textcite[5]{rosenthalModelingTradeDirection2012} bridges the gap between classical trade classification and machine learning by fitting a logistic regression model on lagged and unlagged features innate to the tick rule, quote rule, and \gls{EMO} algorithm, as well as a sector-specific and a time-specific term. Instead of using the rule's discretized outcome as a feature, he models the rules through so-called information strength functions \autocite[6--7]{rosenthalModelingTradeDirection2012}. The proximity to the quotes, central to the \gls{EMO} algorithm, is thus modeled by a proximity function. Likewise, the information strength of the quote and tick rule is estimated as the log return between the trade price and the midpoint or the previous trade price. However, it only improves the accuracy of the \gls{EMO} algorithm by a marginal \SI{2.00}{\percent} for \gls{NASDAQ} stocks and \SI{1.10}{\percent} for \gls{NYSE} stocks \autocite[15]{rosenthalModelingTradeDirection2012}. Our work aims to improve the model by exploring non-linear estimators and minimizing data modeling assumptions. -The work of \textcite[483]{blazejewskiLocalNonParametricModel2005} compares a $k$-nearest neighbour classifier against logistic regression, as well as simple heuristics like the majority vote over past trades for signing trades at the Australian stock exchange. Their results indicate that the parametric $k$-nearest neighbour classifier improves upon a linear logistic regression in terms of classification accuracy, even when trained on fewer features. The work is unique from the remaining works about the feature set definition. Notably, \textcite[483]{blazejewskiLocalNonParametricModel2005} use no quote or trade prices, but rather the order book volumes, trade sizes, and past trade signs for classification. No accuracies for classical trade signing rules are reported, which impedes a comparison across different works. In line with their results, we focus on non-linear models. 
Additionally, our paper addresses the mentioned shortcomings by benchmarking against state-of-the-art trade classification rules. We share the idea of using the trade size, as well as the bid and ask sizes for classification for some of our feature sets, but greedily predict using non-historic features. +The work of \textcite[483]{blazejewskiLocalNonParametricModel2005} compares a $k$-nearest neighbor classifier against logistic regression, as well as simple heuristics like the majority vote over past trades for signing trades at the Australian stock exchange. Their results indicate that the non-parametric $k$-nearest neighbor classifier improves upon a linear logistic regression in terms of classification accuracy, even when trained on fewer features. The work differs from the remaining works in its feature set definition. Notably, \textcite[483]{blazejewskiLocalNonParametricModel2005} use no quote or trade prices, but rather the order book volumes, trade sizes, and past trade signs for classification. No accuracies for classical trade signing rules are reported, which impedes a comparison across different works. In line with their results, we focus on non-linear models. Additionally, our paper addresses the mentioned shortcomings by benchmarking against state-of-the-art trade classification rules. We share the idea of using the trade size, as well as the bid and ask sizes for classification for some of our feature sets, but greedily predict using non-historic features. Closest to our work is a publication by \textcite[1--58]{ronenMachineLearningTrade2022}. Therein, the authors compare a selection of machine learning algorithms against classical trade signing rules in the bond and stock market. Their comparison is the first to consider logistic regression, a random forest, as well as \glspl{feed-forward-network}. Over a wide range of feature sets, the tree-based ensemble consistently outperforms the tick rule and \gls{LR} algorithm, as well as all remaining machine learning models, in terms of out-of-sample accuracy. For the TRACE and \gls{NASDAQ} datasets, their best variant of the random forest outperforms the tick rule by \SI{8.30}{\percent} and \SI{3.30}{\percent}, respectively \autocite[57]{ronenMachineLearningTrade2022}. Whilst the superiority of random forests is consistent for the bond and equity market, fitted classifiers do not transfer across markets, as accuracies diminish in a transfer setting. @@ -30,7 +30,7 @@ \subsection{Research Framework}\label{sec:research-framework} The selection of machine learning methods in previous works is arbitrary and guided by computational constraints. Additionally, it leaves out advancements in machine learning. To address these limitations, we propose a comprehensive research framework for trade classification, outlined in \cref{fig:research-framework}. -Our approach revolves around two key ideas. First, we leverage \glspl{GBRT} and Transformers for trade classification. These methods are selected in \cref{sec:supervised-approaches} for their expected performance, scalability, and extensibility and later enhanced to learn on partially-labelled trades. Second, classical trade classification rules, such as the \gls{LR}, are realised as a rule-based classifier using a stacking principle describe in \cref{sec:stacked-rule}. This allows for a consistent evaluation and model interpretation, eventually bridging the gap between classical trade classification rules and machine learning. +Our approach revolves around two key ideas.
First, we leverage \glspl{GBRT} and Transformers for trade classification. These methods are selected in \cref{sec:supervised-approaches} for their expected performance, scalability, and extensibility, and later enhanced to learn on partially-labeled trades. Second, classical trade classification rules, such as the \gls{LR}, are realized as a rule-based classifier using a stacking principle described in \cref{sec:stacked-rule}. This allows for a consistent evaluation and model interpretation, eventually bridging the gap between classical trade classification rules and machine learning. \begin{figure}[!ht] \centering diff --git a/reports/Content/results.tex b/reports/Content/results.tex index 06bfdaed..d5570b2b 100644 --- a/reports/Content/results.tex +++ b/reports/Content/results.tex @@ -46,7 +46,7 @@ \subsection{Results of Rule-Based Approaches}\label{sec:result-of-rule-based-app The performance of hybrids, such as the \gls{LR} algorithm, hinges on the reliance on the tick test. Thus, the \gls{EMO} rules and to a lesser extent the \gls{CLNV} rules perform worst, achieving accuracies between \SI{55.42}{\percent} and \SI{57.57}{\percent}. In turn, variants of the \gls{LR}, which uses the quote rule for most trades, are among the best-performing algorithms. By extension, \gls{GSU} method (small) further reduces the dependence on tick-based methods through the successive applications of quote rules, here $\operatorname{quote}_{\mathrm{nbbo}} \to \operatorname{quote}_{\mathrm{ex}}$. -Notably, the \gls{GSU} method (large) featuring overrides from the trade size and depth rules performs best, achieving \SI{67.61}{\percent} accuracy on the \gls{ISE} test set and \SI{75.49}{\percent} on the entire dataset. Yet, the performance deteriorates most sharply between sets, as visualised in \cref{fig:classical-accuracies-over-time}. +Notably, the \gls{GSU} method (large) featuring overrides from the trade size and depth rules performs best, achieving \SI{67.61}{\percent} accuracy on the \gls{ISE} test set and \SI{75.49}{\percent} on the entire dataset. Yet, the performance deteriorates most sharply between sets, as visualized in \cref{fig:classical-accuracies-over-time}. \begin{figure}[ht] \centering @@ -55,7 +55,7 @@ \subsection{Results of Rule-Based Approaches}\label{sec:result-of-rule-based-app \label{fig:classical-accuracies-over-time} \end{figure} -\begin{table}[h] +\begin{table}[ht] \centering \caption[Accuracies of Rule-Based Approaches on \glsentryshort{CBOE}]{This table shows the accuracy of common trade classification rules and their variations for option trades on the \gls{CBOE} sample. Unclassifiable trades by the respective rule are assigned randomly as buy or sell. Hybrid methods are estimated using trade prices across all exchanges. We report the percentage of classifiable trades and the overall accuracy for subsets based on our train-test split and the entire dataset. The best rule is in bold.} \label{tab:cboe-classical} @@ -88,19 +88,19 @@ \subsection{Results of Rule-Based Approaches}\label{sec:result-of-rule-based-app We repeat the analysis on the \gls{CBOE} dataset in \cref{tab:cboe-classical} and observe a similar ranking to \cref{tab:ise-classical}. Overall, the performance of classical trade classification rules further diminishes or remains at a low level. Tick-based rules trail the performance of quote-based approaches, and the accuracy of hybrids varies with the dependence on the tick test.
Different from the \gls{ISE} sample, the quote rule estimated on the \gls{NBBO}, leads to a degraded performance than the quote rule applied to \gls{CBOE} quotes. Again, \gls{GSU} method (small) and \gls{GSU} method (large) perform best, though the strong outperformance does not carry over to the test set as depicted \cref{fig:classical-accuracies-over-time}.\footnote{Performance on \gls{CBOE} can be improved if the order of quote rules is reversed. For full combinatoric coverage see \textcite[][33]{grauerOptionTradeClassification2022}. To avoid overfitting the test set by classical rules, we keep the baseline constant following our reasoning from \cref{sec:hyperparameter-tuning}.} -\begin{figure}[!h] +\begin{figure}[!ht] \centering \includegraphics{classical-coverage-over-time.pdf} \caption[Coverage of Rule-Based Classifiers Over Time]{Coverage of rule-based classifiers on \gls{ISE} and \gls{CBOE} sample over time. The bar \myline{} indicates the beginning of a new subset based on the train-test split.} \label{fig:classical-coverage-over-time} \end{figure} -From \cref{tab:ise-classical,tab:cboe-classical} we see, that practically all rule-based approaches leave trades unclassified. This is due to conceptual constraints in the rule itself, but also a result of missing or corrupted data, which equally affects rules with theoretical full coverage. As visualised in \cref{fig:classical-coverage-over-time} coverage decreases qualitatively for selected classification rules over time. It is particularly low when the trade initiator is inferred from the \gls{NBBO}. Theoretically, the tick test can achieve full coverage, in our sample it classifies only $\approx$ \SI{91.5}{\percent}, which is significantly lower than coverage rates reported in the stock market \autocite[][535]{ellisAccuracyTradeClassification2000}. The low, fluctuating coverage stems from the absence of a distinguishable trade price. For the quote rule, we isolate missing or inverted quotes from midspread trades. Through comparison between \cref{fig:classical-coverage-over-time} and \cref{fig:classical-at-mid-over-time} it is evident, that the majority of unclassified trades are midspread trades, whose share increases over time. In our datasets, hybrids, have the advantage of leveraging multiple data sources, resulting in more complete coverage. If, as in the combinations of \textcite[][18--19]{grauerOptionTradeClassification2022}, the basic rules are strong individually, higher coverage is associated with better performance, as fewer trades are classified by a fallback mechanism. +From \cref{tab:ise-classical,tab:cboe-classical} we see, that practically all rule-based approaches leave trades unclassified. This is due to conceptual constraints in the rule itself, but also a result of missing or corrupted data, which equally affects rules with theoretical full coverage. As visualized in \cref{fig:classical-coverage-over-time} coverage decreases qualitatively for selected classification rules over time. It is particularly low when the trade initiator is inferred from the \gls{NBBO}. Theoretically, the tick test can achieve full coverage, in our sample it classifies only $\approx$ \SI{91.5}{\percent}, which is significantly lower than coverage rates reported in the stock market \autocite[][535]{ellisAccuracyTradeClassification2000}. The low, fluctuating coverage stems from the absence of a distinguishable trade price. For the quote rule, we isolate missing or inverted quotes from midspread trades. 
Through comparison between \cref{fig:classical-coverage-over-time} and \cref{fig:classical-at-mid-over-time} it is evident, that the majority of unclassified trades are midspread trades, whose share increases over time. In our datasets, hybrids, have the advantage of leveraging multiple data sources, resulting in more complete coverage. If, as in the combinations of \textcite[][18--19]{grauerOptionTradeClassification2022}, the basic rules are strong individually, higher coverage is associated with better performance, as fewer trades are classified by a fallback mechanism. -\begin{figure}[!h] +\begin{figure}[!ht] \centering \includegraphics{classical_at_mid_over_time.pdf} - \caption[Mid-Spread Trades Over Time]{Percentage of mid-spread trades on \gls{ISE} and \gls{CBOE} sample over time. Estimated using \gls{NBBO} quotes. The bar \myline{} indicates the beginning of a new subset based on the train-test split.} + \caption[Mid-Spread Trades Over Time]{Percentage of midspread trades on \gls{ISE} and \gls{CBOE} sample over time. Estimated using \gls{NBBO} quotes. The bar \myline{} indicates the beginning of a new subset based on the train-test split.} \label{fig:classical-at-mid-over-time} \end{figure} @@ -130,12 +130,12 @@ \subsection{Results of Supervised While absolute improvements in accuracy over $\operatorname{gsu}_{\mathrm{small}}$ are modest on the smallest feature set, improvements are substantial for larger feature sets ranging between \SI{4.730000}{\percent} to \SI{7.860000}{\percent} over $\operatorname{gsu}_{\mathrm{large}}$. Specifically, the addition of trade size-related features positively contributes to the performance. We discuss feature importances in \cref{sec:feature-importance}. -The results can be enhanced through retraining on the validation set improving accuracies to \SI{76.162269}{\percent}, as documented in \cref{app:results-of-supervised-models-with-re-training}. In favour of conservative estimates, our models in the main text do not use this technique. +The results can be enhanced through re-training on the validation set improving accuracies to \SI{76.162269}{\percent}, as documented in \cref{app:results-of-supervised-models-with-re-training}. In favor of conservative estimates, our models in the main text do not use this technique. To formally test, whether differences between both classifiers are significant, we construct contingency tables and pair-wise compare predictions using McNemar's test \autocite[][153--157]{mcnemarNoteSamplingError1947}. We formulate the null hypothesis that both classifiers have the same error rate. Conceptually similar \textcite[][267]{odders-whiteOccurrenceConsequencesInaccurate2000}, uses contingency tables of rule-based methods and true labels. Here, contingency tables are used to pair-wise compare the predictions of \glspl{GBRT} against Transformers. -\begin{table}[!h] +\begin{table}[!ht] \centering \sisetup{table-number-alignment=right, table-format=7.0} \caption[Contingency Tables of Supervised Classifiers]{This table contains the contingency tables of the supervised classifiers on the \gls{CBOE} and \gls{ISE} test set for feature set classical, classical-size, and option. Cells sum the number of trades, correctly/falsely classified by both classifiers or one. 
Additionally, McNemar's test statistic $\chi^2$ and the associated $p$-value are reported.} @@ -163,14 +163,14 @@ \subsection{Results of Supervised Relative to related works performing trade classification with machine learning, the improvements are strong, as documented in \cref{app:literature-ml-tc}. As no other work studies the option market or identical model architectures, the results are indicative. The studies report improvements between \SI{1.1}{\percent} and \SI{13.3}{\percent} for their machine learning models over the benchmark. Our absolute improvements exceed all linear models, but the absolute improvements are smaller relative to some tree-based and deep learning models in \textcite[][49]{ronenMachineLearningTrade2022}. At the same time, our models are trained on significantly fewer features and on a static training set requiring a fraction of the training cost. We believe our conservative framing aligns well with scenarios where trade classification is only a prerequisite to other empirical research. -Visually, the performance differences between gradient boosting and Transformers on the same feature sets are minor, which is in accordance to \textcites{grinsztajnWhyTreebasedModels2022}{gorishniyRevisitingDeepLearning2021}. These studies conclude, generally for tabular modelling, that neither Transformers nor \glspl{GBRT} are universally superior. Our results validate this observation, specifically for trade classification. +Visually, the performance differences between gradient boosting and Transformers on the same feature sets are minor, which is in accordance with \textcites{grinsztajnWhyTreebasedModels2022}{gorishniyRevisitingDeepLearning2021}. These studies conclude, generally for tabular modeling, that neither Transformers nor \glspl{GBRT} are universally superior. Our results validate this observation, specifically for trade classification. % \todo{It is conceivable, that ...} % Our findings thereby contradict those of \textcite[][14--49]{ronenMachineLearningTrade2022}, who benchmark tree-based ensembles in the form of random forests and neural networks in the form of \gls{FFN} for trade classification in the equity and bond market and find clear dominance of the tree-based approach. Beyond differences in the market under study and variants, two methodological differences are evident, that explain the diverging results. First, unlike \gls{FFN}, the FT-Transformer is tailored to learn on tabular data through being a rotationally-invariant learner. Second, the data pre-processing and feature engineering is tailored to the requirements of neural networks. Without these measures, tree-based approaches excel due to their robustness in handling skewed and missing data. Despite the lack of adaptation to \gls{CBOE} data, the performance improvements are highest for the \gls{CBOE} dataset. This result is in stark contrast to the findings of \textcite[][32]{ronenMachineLearningTrade2022}, who test random forests for trade classification and report subpar performance. Their setting differs from ours, as they apply ensembles trained in the bond market to equity trades. Moreover, it is unclear if data preprocessing procedures are shared between both sets, which may hamper performance. -Part of the strong performance on \gls{CBOE} trades hails from a weaker benchmark performance, but also from a stronger accuracy of classifiers on the smallest and mid-sized feature sets. One would expect a degradation between sets, assuming exchange-specific trading patterns.
+Part of the strong performance on \gls{CBOE} trades hails from weaker benchmark performance, but also from a stronger accuracy of classifiers on the smallest and mid-sized feature sets. One would expect a degradation between sets, assuming exchange-specific trading patterns. In summary, our supervised methods establish a new state-of-the-art in option trade classification. Our approach achieves full coverage and outperforms all previously reported classification rules in terms of accuracy. Performance transfers across exchanges. We perform additional robustness checks in \cref{sec:robustness-checks} to identify any systematic misclassification. @@ -196,9 +196,9 @@ \subsection{Results of Semi-supervised Identical to the supervised case, our models consistently outperform their respective benchmarks. Gradient boosting with self-training surpasses $\operatorname{gsu}_{\mathrm{small}}$ by \SI{3.350000}{\percent} on \gls{ISE} and \SI{5.440000}{\percent} on \gls{CBOE} in accuracy. Improvements for larger feature sets over $\operatorname{gsu}_{\mathrm{large}}$ are marginally lower to the supervised model and range between \SI{4.550000}{\percent} and \SI{7.440000}{\percent}. We already observed a similar result on the validation set in \cref{sec:hyperparameter-tuning}. -Pre-training is beneficial for the performance of Transformers on \gls{ISE} trades, improving over Transformer with random initialisation by up to \SI{0.87000}{\percent}. Hence, the performance improvement from pre-training on the validation set carries over the test set. On the \gls{CBOE} dataset, pre-training hurts performance. +Pre-training is beneficial for the performance of Transformers on \gls{ISE} trades, improving over Transformer with random initialization by up to \SI{0.87000}{\percent}. Hence, the performance improvement from pre-training on the validation set carries over the test set. On the \gls{CBOE} dataset, pre-training hurts performance. -\begin{table}[!h] +\begin{table}[!ht] \centering \sisetup{table-number-alignment=right, table-format=7.0} \caption[Contingency Tables of Semi-Supervised Classifiers]{This table contains the contingency tables of the semi-supervised classifiers on the \gls{CBOE} and \gls{ISE} test set for feature set classical, classical-size, and option. Cells sum the number of trades, correctly/falsely classified by both classifiers or one. Additionally, McNemar's test statistic $\chi^2$ and the associated $p$-value are reported.} @@ -224,13 +224,13 @@ \subsection{Results of Semi-supervised As evident from \cref{tab:contigency-semi-supervised-classifiers}, a vast majority of trades are classified by both classifiers correctly. For the \gls{ISE}, performance improvements in larger feature sets are driven by trades that are distinctly classified by both classifiers. In turn, at the \gls{CBOE}, the share of common classifications continues to grow. Performance differences between classifiers estimated by the McNemar test are significant. -As no previous work performed semi-supervised classification, we focus our discussion on the performance difference between pre-training and self-training. On \gls{ISE} data, pre-training with the \gls{RTD} objective on unlabelled trades yields significantly stronger performance. The results align with the intuition from \cref{sec:extensions-to-transformer}, that pre-training exposes the model to a larger quantity of trades, which strengthens its ability to learn generalisable knowledge about the data useful in later trade classification. 
Also, the model is exposed to more diverse trades, as unlabelled trades are not restricted by customer type or trading activity, effectively preventing overfitting.
+As no previous work performed semi-supervised classification, we focus our discussion on the performance difference between pre-training and self-training. On \gls{ISE} data, pre-training with the \gls{RTD} objective on unlabeled trades yields significantly stronger performance. The results align with the intuition from \cref{sec:extensions-to-transformer} that pre-training exposes the model to a larger quantity of trades, which strengthens its ability to learn generalizable knowledge about the data that is useful in later trade classification. Also, the model is exposed to more diverse trades, as unlabeled trades are not restricted by customer type or trading activity, effectively preventing overfitting.

-An explanation as to why pre-training improves performance on \gls{ISE} but not \gls{CBOE} trades, may be found in the pre-training data and setup. Trades used for pre-training are recorded at the \gls{ISE} only and are repeatedly shown to the model. While our pre-training objective is stochastic with different features being masked in each epoch, past research has shown that repeatedly presenting the same tokens in conjunction with a small-sized pre-training dataset, can degrade performance on the downstream classification task. For instance, \textcite[][27--28]{raffelExploringLimitsTransfer2020} document in the context of language modelling that a high degree of repetition encourages memorization in the model, but few repetitions are not harmful. As each trade is only shown $20\times$ to the model, but the size of the dataset is significantly smaller, the true impact remains unclear. Future work could revisit pre-training on a larger subset of LiveVol, incorporating trades from different exchanges, whereby each trade is only shown once to the model. We assume, that such a setup would, analogous to language modelling, improve performance on both \gls{ISE} and \gls{CBOE} trades, as the model is less prone to memorize data and learns a more diverse context.
+An explanation as to why pre-training improves performance on \gls{ISE} but not \gls{CBOE} trades may be found in the pre-training data and setup. Trades used for pre-training are recorded at the \gls{ISE} only and are repeatedly shown to the model. While our pre-training objective is stochastic with different features being masked in each epoch, past research has shown that repeatedly presenting the same tokens in conjunction with a small-sized pre-training dataset can degrade performance on the downstream classification task. For instance, \textcite[][27--28]{raffelExploringLimitsTransfer2020} document in the context of language modeling that a high degree of repetition encourages memorization in the model, but few repetitions are not harmful. As each trade is shown only $20\times$ to the model, but the size of the dataset is significantly smaller, the true impact remains unclear. Future work could revisit pre-training on a larger subset of LiveVol, incorporating trades from different exchanges, whereby each trade is only shown once to the model. We assume that such a setup would, analogous to language modeling, improve performance on both \gls{ISE} and \gls{CBOE} trades, as the model is less prone to memorize data and learns a more diverse context.
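To make the \gls{RTD}-style pre-training discussed above more tangible, the following minimal sketch shows one common way to corrupt a batch of tabular trades: a random subset of feature values is swapped for values drawn from the empirical marginal of the same column, and the model is trained to flag the replaced positions. This is an illustrative NumPy sketch under simplifying assumptions (function and variable names are hypothetical), not the implementation used in this thesis.
\begin{verbatim}
import numpy as np

def corrupt_batch(X, replace_prob=0.15, rng=None):
    """Replaced-feature-detection style corruption for tabular data.

    With probability `replace_prob`, a feature value is swapped for the
    value of a randomly chosen other row (a draw from the column's
    empirical marginal). Returns the corrupted batch and the binary mask
    of replaced positions, which is the target of the per-feature
    'replaced vs. original' classification head.
    """
    if rng is None:
        rng = np.random.default_rng(42)
    n, d = X.shape
    mask = rng.random((n, d)) < replace_prob      # True = replaced
    donors = rng.integers(0, n, size=(n, d))      # rows to borrow values from
    X_corrupt = np.where(mask, X[donors, np.arange(d)], X)
    return X_corrupt, mask.astype(np.float32)

# toy usage: 8 trades, 5 features
X = np.random.default_rng(0).normal(size=(8, 5))
X_tilde, is_replaced = corrupt_batch(X)
# a Transformer encoder with a per-feature binary head would be trained to
# predict `is_replaced` from `X_tilde` with a binary cross-entropy loss
\end{verbatim}
Because the corruption is redrawn in every epoch, each trade reappears in a different corrupted form; the repetition concern raised above refers to the underlying, uncorrupted feature values.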
-Self-training with \glspl{GBRT} as a base learner generally performs worse than \glspl{GBRT} trained on labelled trades, which contradicts our initial motivation for self-training in \cref{sec:extensions-to-gradient-boosted-trees}. With the pseudo labels derived from high-confident predictions, the success of self-training hinges on the reliability of the predicted class probabilities. In our analysis of the default \gls{GBRT} in \cref{sec:training-and-tuning} we observed that the validation loss in terms of sample-wise cross-entropy loss stagnates due to a growing number of overconfident but erroneous predictions. Although we cannot confirm for the self-training classifier, due to the absence of true labels, it is conceivable, that the increased number of confident yet incorrect predictions, affects the generated pseudo labels. Without the ability to correct for errors, self-training performance on the validation and test set is directly impacted.
+Self-training with \glspl{GBRT} as a base learner generally performs worse than \glspl{GBRT} trained on labeled trades, which contradicts our initial motivation for self-training in \cref{sec:extensions-to-gradient-boosted-trees}. With the pseudo labels derived from high-confidence predictions, the success of self-training hinges on the reliability of the predicted class probabilities. In our analysis of the default \gls{GBRT} in \cref{sec:training-and-tuning}, we observed that the validation loss in terms of sample-wise cross-entropy loss stagnates due to a growing number of overconfident but erroneous predictions. Although we cannot confirm this for the self-training classifier due to the absence of true labels, it is conceivable that the increased number of confident yet incorrect predictions affects the generated pseudo labels. Without the ability to correct for errors, self-training performance on the validation and test set is directly impacted.

-To summarise, unrewarded for higher training costs, semi-supervised variants of \glspl{GBRT} do not provide better generalisation performance than supervised approaches. Pre-training of Transformers improves performance on the \gls{ISE} sample but slightly deteriorates performance on the \gls{CBOE} set. We subsequently evaluate if semi-supervised learning improves robustness if not performance.
+To summarize, despite higher training costs, semi-supervised variants of \glspl{GBRT} do not provide better generalization performance than supervised approaches. Pre-training of Transformers improves performance on the \gls{ISE} sample but slightly deteriorates performance on the \gls{CBOE} set. We subsequently evaluate whether semi-supervised learning improves robustness, if not performance.

\subsection{Robustness of Results}\label{sec:robustness-checks}

@@ -244,7 +244,7 @@ \subsection{Robustness of Results}\label{sec:robustness-checks}

\textbf{Gradient Boosting}

-Performance improvements of \glspl{GBRT} are consistent for calls and puts across all feature sets and exchanges. Conditional on the security type of the underlying, \gls{GBRT} achieves the largest improvements for index options in the \gls{CBOE} sample, but perform slightly worse than rule-based approaches on the \gls{ISE} set. on both datasets, accuracies are lowest for index options, which corroborates with the literature on rule-based classification.
+Performance improvements of \glspl{GBRT} are consistent for calls and puts across all feature sets and exchanges. 
Conditional on the security type of the underlying, \gls{GBRT} achieves the largest improvements for index options in the \gls{CBOE} sample, but performs slightly worse than rule-based approaches on the \gls{ISE} set. On both datasets, accuracies are lowest for index options, which corroborates the literature on rule-based classification.

The performance is stable for different trade sizes and over time. Similarly, accuracy improvements are comparable for different maturities and moneyness ratios. Aligning with rule-based approaches, accuracies are lowest for option trades with long maturities and deep \gls{ITM} options, as reported in \textcite[][22]{grauerOptionTradeClassification2022}. The addition of option-specific features has a smoothing effect on accuracies across moneyness ratios and maturities.
@@ -301,8 +301,7 @@ \subsection{Robustness of Results}\label{sec:robustness-checks}
    \tabindent Outside & 61.350064 & -5.430000 & 61.846608 & -3.070000 & 64.034087 & -0.890000 \\
    \tabindent Unknown & 78.638385 & 2.230000 & 78.275744 & 1.870000 & 78.816285 & 2.410000 \\ \cmidrule(rl){1-7}
-    \multicolumn{7}{l}{ All} \\
-    \tabindent All & 63.668637 & 3.620000 & 72.343640 & 4.730000 & 74.120496 & 6.510000 \\
+All & 63.668637 & 3.620000 & 72.343640 & 4.730000 & 74.120496 & 6.510000 \\
    \bottomrule
    \end{tabular}
\end{table}
@@ -360,8 +359,7 @@ \subsection{Robustness of Results}\label{sec:robustness-checks}
    \tabindent Outside & 70.719978 & -4.200000 & 70.369607 & -1.560000 & 69.648255 & -2.280000 \\
    \tabindent Unknown & 83.771336 & 1.110000 & 83.608778 & 0.950000 & 84.213854 & 1.550000 \\ \cmidrule(rl){1-7}
-    \multicolumn{7}{l}{ All} \\
-    \tabindent All & 66.002029 & 5.260000 & 71.951794 & 5.430000 & 74.375033 & 7.860000 \\
+All & 66.002029 & 5.260000 & 71.951794 & 5.430000 & 74.375033 & 7.860000 \\
    \bottomrule
    \end{tabular}
\end{table}
@@ -372,7 +370,7 @@ \subsection{Robustness of Results}\label{sec:robustness-checks}

Performance results of Transformers are robust across all tested dimensions. The accuracy is approximately equal for calls and puts. We observe that the benchmark performance of puts is consistently higher in our sub-samples, which contrasts with the finding of \textcite[][22]{grauerOptionTradeClassification2022}.

-Similar to \glspl{GBRT}, the FT-Transformer slightly underperforms the benchmark for index options in the \gls{ISE} sample. Even though the effect reverses on the \gls{CBOE} set, accuracies for index options are lower than the of any other underlying. Hence, we can extend the finding of \textcites[][22]{grauerOptionTradeClassification2022}[][9]{savickasInferringDirectionOption2003} that index options are notoriously difficult to classify to machine learning-based approaches.
+Similar to \glspl{GBRT}, the FT-Transformer slightly underperforms the benchmark for index options in the \gls{ISE} sample. Even though the effect reverses on the \gls{CBOE} set, accuracies for index options are lower than those of any other underlying. Hence, we can extend the finding of \textcites[][22]{grauerOptionTradeClassification2022}[][9]{savickasInferringDirectionOption2003}, that index options are notoriously difficult to classify, to machine learning-based approaches.

Classification is more accurate for near-expiring or deep \gls{ITM} options. In this sense, our finding contradicts the observation of \textcite[][891]{savickasInferringDirectionOption2003} made for rule-based classification. 
Again, we document that the addition of option-specific features, such as maturity or moneyness, smooths out differences across maturity and moneyness levels. We defer discussing this aspect to \cref{sec:feature-importance}.
@@ -429,8 +427,7 @@ \subsection{Robustness of Results}\label{sec:robustness-checks}
    \tabindent Outside & 63.228880 & -3.550000 & 63.792525 & -1.130000 & 65.610951 & 0.690000 \\
    \tabindent Unknown & 78.268902 & 1.860000 & 77.824153 & 1.420000 & 78.522066 & 2.110000 \\ \cmidrule(rl){1-7}
-    \multicolumn{7}{l}{ All} \\
-    \tabindent All & 63.783020 & 3.730000 & 72.581107 & 4.970000 & 73.921795 & 6.310000 \\
+    All & 63.783020 & 3.730000 & 72.581107 & 4.970000 & 73.921795 & 6.310000 \\
    \bottomrule
    \end{tabular}
\end{table}
@@ -486,8 +483,7 @@ \subsection{Robustness of Results}\label{sec:robustness-checks}
    \tabindent Outside & 73.145095 & -1.770000 & 72.581753 & 0.650000 & 71.908491 & -0.020000 \\
    \tabindent Unknown & 83.807460 & 1.150000 & 83.220446 & 0.560000 & 83.491375 & 0.830000 \\ \cmidrule(rl){1-7}
-    \multicolumn{7}{l}{ All} \\
-    \tabindent All & 66.182348 & 5.440000 & 72.153338 & 5.640000 & 74.278318 & 7.760000 \\
+    All & 66.182348 & 5.440000 & 72.153338 & 5.640000 & 74.278318 & 7.760000 \\
    \bottomrule
    \end{tabular}
\end{table}
@@ -496,9 +492,9 @@ \subsection{Robustness of Results}\label{sec:robustness-checks}

\textbf{Gradient-Boosting With Self-Training}

-We analyse the robustness of \gls{GBRT} with self-training on \gls{CBOE} data in \cref{tab:diff-ise-gbm-semi} and \gls{CBOE} data in \cref{tab:diff-cboe-gbm-semi}. Similar to what we observe for the vanilla \glspl{GBRT}, \glspl{GBRT} with self-training outperforms the respective benchmarks on almost all subsets. The only exceptions are index options and options traded outside the quotes, where the model performs worse than \gls{GSU} method (small/large).
+We analyze the robustness of \gls{GBRT} with self-training on \gls{ISE} data in \cref{tab:diff-ise-gbm-semi} and \gls{CBOE} data in \cref{tab:diff-cboe-gbm-semi}. Similar to what we observe for the standard \glspl{GBRT}, \glspl{GBRT} with self-training outperforms the respective benchmarks on almost all subsets. The only exceptions are index options and options traded outside the quotes, where the model performs worse than the \gls{GSU} method (small/large).

-Compared to the vanilla \glspl{GBRT}, performance degrades across almost all subsets. Quantitatively, we find no improvements in robustness as performance differences between sub-samples are of the same magnitude and the performance gap between rule-based classification extends for index options and trades outside the spread.
+Compared to the standard \glspl{GBRT}, performance degrades across almost all subsets. Quantitatively, we find no improvements in robustness, as performance differences between sub-samples are of the same magnitude and the performance gap to rule-based classification widens for index options and trades outside the spread.
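As a reference point for the discussion of self-training above and in the preceding subsection, the sketch below shows a generic pseudo-labeling loop with a gradient-boosting base learner. It uses scikit-learn's \texttt{HistGradientBoostingClassifier} purely for illustration; the thesis' own \glspl{GBRT}, confidence threshold, and number of rounds differ, so treat the values as hypothetical.
\begin{verbatim}
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier

def self_train(X_lab, y_lab, X_unlab, threshold=0.95, max_rounds=3):
    """Generic self-training loop with a gradient-boosting base learner.

    Each round, the current model labels the unlabeled trades; predictions
    whose class probability exceeds `threshold` are added as pseudo-labels
    and the model is refit. Overconfident but wrong predictions propagate
    into later rounds, which is the failure mode discussed in the text.
    """
    X_train, y_train, pool = X_lab.copy(), y_lab.copy(), X_unlab.copy()
    for _ in range(max_rounds):
        model = HistGradientBoostingClassifier(max_iter=200).fit(X_train, y_train)
        if len(pool) == 0:
            break
        proba = model.predict_proba(pool)
        confident = proba.max(axis=1) >= threshold
        if not confident.any():
            break
        pseudo_y = model.classes_[proba.argmax(axis=1)]
        X_train = np.vstack([X_train, pool[confident]])
        y_train = np.concatenate([y_train, pseudo_y[confident]])
        pool = pool[~confident]
    return model
\end{verbatim}
Since pseudo-labels are never revisited, a single miscalibrated round can contaminate all subsequent fits, mirroring the lack of error correction noted above.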
\begin{table} \centering @@ -551,8 +547,7 @@ \subsection{Robustness of Results}\label{sec:robustness-checks} \tabindent Outside & 60.665638 & -6.110000 & 61.222573 & -3.700000 & 63.007448 & -1.910000 \\ \tabindent Unknown & 78.419432 & 2.010000 & 78.173110 & 1.770000 & 78.775231 & 2.370000 \\ \cmidrule(rl){1-7} - \multicolumn{7}{l}{ All} \\ - \tabindent All & 63.397514 & 3.350000 & 72.156489 & 4.550000 & 73.536644 & 5.930000 \\ + All & 63.397514 & 3.350000 & 72.156489 & 4.550000 & 73.536644 & 5.930000 \\ \bottomrule \end{tabular} \end{table} @@ -608,8 +603,7 @@ \subsection{Robustness of Results}\label{sec:robustness-checks} \tabindent Outside & 72.265732 & -2.650000 & 69.682605 & -2.250000 & 68.810113 & -3.120000 \\ \tabindent Unknown & 83.581685 & 0.920000 & 83.292694 & 0.630000 & 83.861645 & 1.200000 \\ \cmidrule(rl){1-7} - \multicolumn{7}{l}{ All} \\ - \tabindent All & 66.189454 & 5.440000 & 71.922680 & 5.410000 & 73.953322 & 7.440000 \\ + All & 66.189454 & 5.440000 & 71.922680 & 5.410000 & 73.953322 & 7.440000 \\ \bottomrule \end{tabular} \end{table} @@ -620,9 +614,9 @@ \subsection{Robustness of Results}\label{sec:robustness-checks} Transformers with pre-training objectives outperform the benchmark in all subsets apart from index options and trades outside the quotes. For \gls{ISE} trades in \cref{tab:diff-ise-transformer-semi} pre-training improves performance across subsets, reaching accuracies greater than \SI{86}{\percent}. The only exception is index options, where the performance gap slightly widens. Deep-\gls{ITM} options and options with long maturity profit the most from the introduction of option features. -For trades at the \gls{CBOE} performance improvements associated with pre-training are slightly lower across several sub-groups. Positively, pre-training improves robustness, as the performance gap to the benchmarks narrows for trades outside the quotes. The results in conjunction with the identical model architecture suggest, that pre-training on unlabelled trades encodes exchange-specific knowledge, which improves performance and robustness on \gls{ISE} trades, but does not universally profit \gls{CBOE} trades. +For trades at the \gls{CBOE} performance improvements associated with pre-training are slightly lower across several sub-groups. Positively, pre-training improves robustness, as the performance gap to the benchmarks narrows for trades outside the quotes. The results in conjunction with the identical model architecture suggest, that pre-training on unlabeled trades encodes exchange-specific knowledge, which improves performance and robustness on \gls{ISE} trades, but does not universally profit \gls{CBOE} trades. -So far it remains opens open, to why most classifiers struggle to correctly classify index options and options traded outside the quotes. Index options are notoriously difficult to classify by standard algorithms, as unanimously documented in \textcites[][898-898]{savickasInferringDirectionOption2003}[][20]{grauerOptionTradeClassification2022}. \textcite[][898-898]{savickasInferringDirectionOption2003} trace back the low accuracy to the intensified use of complex trades in index option trading, such as bull spreads, which typically involve simultaneous buys and sells of options. Conceptually, it remains unclear if the components should be classified separately or as single complex trade. The explanation sheds light on why classification is difficult as a whole, but it does not address why accuracies trail the benchmark. 
\todo{We cannot test as we do not have simultanous buy buy and sell orders?... ok different option series, but no other trade} Some insights can be gained from the data distribution: index trades make up only \SI{1.0731}{\percent} of all trades on the \gls{ISE} training set, resulting in a highly imbalanced distribution of the security type. Consequently, the model has fewer index option samples to train on and is susceptible to overfitting if it learns distinguishable patterns for security types. A sample weighting scheme could place more emphasis on index options.
+So far, it remains open why most classifiers struggle to correctly classify index options and options traded outside the quotes. Index options are notoriously difficult to classify by standard algorithms, as unanimously documented in \textcites[][898-898]{savickasInferringDirectionOption2003}[][20]{grauerOptionTradeClassification2022}. \textcite[][898-898]{savickasInferringDirectionOption2003} trace back the low accuracy to the intensified use of complex trades in index option trading, such as bull spreads, which typically involve simultaneous buys and sells of options. Conceptually, it remains unclear if the components should be classified separately or as a single complex trade. The explanation sheds light on why classification is difficult as a whole, but it does not address why accuracies trail the benchmark. \todo{We cannot test as we do not have simultaneous buy and sell orders?... ok different option series, but no other trade} Some insights can be gained from the data distribution: index trades make up only \SI{1.0731}{\percent} of all trades on the \gls{ISE} training set, resulting in a highly imbalanced distribution of the security type. Consequently, the model has fewer index option samples to train on and is susceptible to overfitting if it learns distinguishable patterns for security types. A sample weighting scheme could place more emphasis on index options, as sketched below.

In our test sets, options traded outside the quotes can be reliably classified with the quote rule, which aligns with the intuition that customers are willing to trade at an additional liquidity premium, hence outside the quotes. We suspect that the reason why our methods fail to learn this simple pattern lies in its infrequency, as such trades make up only \SI{0.7535}{\percent} of the \gls{ISE} dataset. Following our reasoning from above, the model can overfit the training samples more easily, eventually resulting in poor out-of-sample performance. As both subsets account for only a fraction of the entire set and differences in performance are minor, we conclude that our approaches are stable across multiple dimensions and between exchanges.
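The sample weighting scheme mentioned above could, for example, weight each trade inversely to the frequency of its security type, so that the rare index options contribute more to the training loss. The snippet below is an illustrative sketch in scikit-learn conventions with hypothetical variable names; it is not part of the reported experiments.
\begin{verbatim}
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier

def security_type_weights(security_type):
    """Weight trades inversely to the frequency of their security type,
    e.g., giving the roughly 1 percent of index option trades more weight."""
    types, counts = np.unique(security_type, return_counts=True)
    inv_freq = {t: counts.sum() / c for t, c in zip(types, counts)}
    return np.array([inv_freq[s] for s in security_type], dtype=float)

# hypothetical arrays: X (features), y (buy/sell), security_type per trade
# weights = security_type_weights(security_type)
# HistGradientBoostingClassifier().fit(X, y, sample_weight=weights)
\end{verbatim}
Whether such reweighting actually improves accuracy on index options would need to be verified empirically, as it trades off performance on the dominant equity option trades.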
@@ -677,8 +671,7 @@ \subsection{Robustness of Results}\label{sec:robustness-checks} \tabindent Outside & 65.067436 & -1.710000 & 65.241898 & 0.320000 & 66.000134 & 1.080000 \\ \tabindent Unknown & 78.330482 & 1.920000 & 77.954157 & 1.550000 & 78.966815 & 2.560000 \\ \cmidrule(rl){1-7} - \multicolumn{7}{l}{ All} \\ - \tabindent All & 64.655751 & 4.600000 & 72.859054 & 5.250000 & 74.551410 & 6.940000 \\ + All & 64.655751 & 4.600000 & 72.859054 & 5.250000 & 74.551410 & 6.940000 \\ \bottomrule \end{tabular} \end{table} @@ -734,8 +727,7 @@ \subsection{Robustness of Results}\label{sec:robustness-checks} \tabindent Outside & 74.484749 & -0.430000 & 73.996977 & 2.070000 & 72.822204 & 0.890000 \\ \tabindent Unknown & 83.897769 & 1.240000 & 83.130136 & 0.470000 & 83.671995 & 1.010000 \\ \cmidrule(rl){1-7} - \multicolumn{7}{l}{ All} \\ - \tabindent All & 65.668441 & 4.920000 & 71.783984 & 5.270000 & 74.095833 & 7.580000 \\ + All & 65.668441 & 4.920000 & 71.783984 & 5.270000 & 74.095833 & 7.580000 \\ \bottomrule \end{tabular} \end{table} @@ -746,22 +738,22 @@ \subsection{Feature Importance}\label{sec:feature-importance} Transformers outperform all rule-based approaches by a large margin on the \gls{ISE} dataset. To gain insights into the factors driving this performance, we conduct a qualitative analysis of the attention mechanism and learned embeddings. For an evaluation of feature importances, that suffices for a cross-model comparison, we utilize \gls{SAGE}, building upon our rationale from \cref{sec:feature-importance-measure}. -\textbf{Attention Visualisation} +\textbf{Attention Visualization} The analysis of attention follows a top-down approach. Initially, we generate model-wide attention maps using the methodology of \textcite[][2--4]{cheferTransformerInterpretabilityAttention2021}. Subsequently, we detail the analysis by probing specific attention heads adapting a procedure of \textcite[][4]{clarkWhatDoesBERT2019}. -Attention maps offer transparency at the trade or dataset level. To aid visualisation, we focus on subsets of trades, where the performance of Transformers is particularly strong and select \num{16} trades at the quotes and \num{16} mid-spread trades from the \gls{ISE} test set. The resulting attention maps are shown in \cref{fig:attention-maps-ise}. +Attention maps offer transparency at the trade or dataset level. To aid visualization, we focus on subsets of trades, where the performance of Transformers is particularly strong and select \num{16} trades at the quotes and \num{16} midspread trades from the \gls{ISE} test set. The resulting attention maps are shown in \cref{fig:attention-maps-ise}. \begin{figure}[h!] \centering \includegraphics[width=1\textwidth]{attention_maps_ise_quotes_mid.pdf} - \caption[Attention Maps of FT-Transformer]{Attention maps of FT-Transformer trained on \gls{ISE} data with \gls{FS} option. The left plot contains attention weights of \num{16} trades at the quotes and the right plot of \num{16} mid-spread trades. Each column represents a trade and each row a feature. The intensity of the pixel represents the importance. $\mathtt{[CLS]}$ token excluded, as suggested in \textcite[][4]{cheferGenericAttentionmodelExplainability2021}. The green area marks a trade, that was correctly classified by the network. Details on the trade are given below.} + \caption[Attention Maps of FT-Transformer]{Attention maps of FT-Transformer trained on \gls{ISE} data with \gls{FS} option. 
The left plot contains attention weights of \num{16} trades at the quotes and the right plot of \num{16} midspread trades. Each column represents a trade and each row represents a feature. The intensity of the pixel represents the importance. $\mathtt{[CLS]}$ token excluded, as suggested in \textcite[][4]{cheferGenericAttentionmodelExplainability2021}. The green area marks a trade, that was correctly classified by the network. Details on the trade are given below.} \label{fig:attention-maps-ise} \end{figure} -Visually, the trade price and quotes at the exchange or inter-exchange level are important and frequently used. This aligns with theory, as these features are core to the quote rule and numerous hybrid algorithms. Also, quote-based algorithms are among the best performing in our dataset. Aside from the trade price, features required to estimate the tick rule attain only spurious attributions. Considering the devastating performance of tick-based algorithms in option trade classification, this is unsurprising. Features from the depth and trade size rule, such as the trade size, are used selectively for trades at the quotes. In this subset, option-specific features like the issue type, moneyness, time to maturity, or daily trading volume of the option series receive relatively high attention scores. Overall, engineered features, like the proximity to quotes, attain low attention scores, which suggests that the Transformer itself can synthesise the feature from the \emph{raw} bid, ask, and trade price. +Visually, the trade price and quotes at the exchange or inter-exchange level are important and frequently used. This aligns with theory, as these features are core to the quote rule and numerous hybrid algorithms. Also, quote-based algorithms are among the best performing in our dataset. Aside from the trade price, features required to estimate the tick rule attain only spurious attributions. Considering the devastating performance of tick-based algorithms in option trade classification, this is unsurprising. Features from the depth and trade size rule, such as the trade size, are used selectively for trades at the quotes. In this subset, option-specific features like the issue type, moneyness, time to maturity, or daily trading volume of the option series receive relatively high attention scores. Overall, engineered features, like the proximity to quotes, attain low attention scores, which suggests that the Transformer itself can synthesize the feature from the \emph{raw} bid, ask, and trade price. -The model assigns higher attention scores to features present in rule-based algorithms. Due to the possible link to rule-based trade classification, it is worthwhile to explore, if the fine-grained patterns learned by specific attention heads translate to existing trade classification rules i.e., if specific tokens attend to features that are jointly used in rule-based classification. This information is sacrificed when aggregating over multiple attention heads and layers, as done for \cref{fig:attention-maps-ise}, but readily available from individual attention heads. To analyse this further, we adapt the approach of \textcite[][4]{clarkWhatDoesBERT2019} to our context and probe individual attention heads. +The model assigns higher attention scores to features present in rule-based algorithms. 
Due to the possible link to rule-based trade classification, it is worthwhile to explore whether the fine-grained patterns learned by specific attention heads translate to existing trade classification rules, i.e., whether specific tokens attend to features that are jointly used in rule-based classification. This information is sacrificed when aggregating over multiple attention heads and layers, as done for \cref{fig:attention-maps-ise}, but readily available from individual attention heads. To analyze this further, we adapt the approach of \textcite[][4]{clarkWhatDoesBERT2019} to our context and probe individual attention heads.

\begin{figure}[h!]
    \subfloat[Tick Rule-like Head (3,5)\label{fig:head-tick}]{\includegraphics[width=0.3\textwidth]{attention_head_5_layer_3_color_green_ise_quotes_mid.pdf}}
    \hfill
    \subfloat[Trade Size Rule-like Head (3,8)\label{fig:head-tsize}]{\includegraphics[width=0.3\textwidth]{attention_head_8_layer_3_color_green_ise_quotes_mid.pdf}}
    \hfill
    \subfloat[\glsentryshort{LR}-Like Head (4,8)\label{fig:head-lr}]{\includegraphics[width=0.3\textwidth]{attention_head_8_layer_4_color_green_ise_quotes_mid.pdf}}
-    \caption[Rule-like Roles of Selected Attention Heads]{Attention heads that correspond to trade classification rules. Tuple denotes the location of the attention head in the model in the form of (layer, head). The intensity of the line represents the strength of attention weight. Attentions are only visualised for the $\mathtt{[CLS]}$ token. The model is trained on \gls{ISE} data. Visualisations based on code by \textcite[][4]{clarkWhatDoesBERT2019}.}
+    \caption[Rule-like Roles of Selected Attention Heads]{Attention heads that correspond to trade classification rules. Tuple denotes the location of the attention head in the model in the form of (layer, head). The intensity of the line represents the strength of the attention weight. Attentions are only visualized for the $\mathtt{[CLS]}$ token. The model is trained on \gls{ISE} data. Visualizations based on code by \textcite[][4]{clarkWhatDoesBERT2019}.}
    \label{fig:rule-like-attention-heads}
\end{figure}

-We study attention weights of one specific trade in detail, which is marked in green in \cref{fig:attention-maps-ise}. The trade has the following properties: trade price \SI{3.5}[\$]{}, trade size \SI{5}[]{} contracts, ask at exchange, \SI{3.85}[\$]{}, bid at exchange \SI{3.5}[\$]{}, ask size \SI{11}[]{} contracts, and bid size \SI{10}[]{} contracts, classified as sell. \cref{fig:rule-like-attention-heads} depicts the result for selected attention heads involved in classifying the specific trade. The remaining attention heads are visualised in \cref{app:attention-heads-of-transformer}.
+We study the attention weights of one specific trade in detail, which is marked in green in \cref{fig:attention-maps-ise}. The trade has the following properties: trade price \SI{3.5}[\$]{}, trade size \SI{5}[]{} contracts, ask at exchange \SI{3.85}[\$]{}, bid at exchange \SI{3.5}[\$]{}, ask size \SI{11}[]{} contracts, and bid size \SI{10}[]{} contracts, classified as sell. \cref{fig:rule-like-attention-heads} depicts the result for selected attention heads involved in classifying the specific trade. The remaining attention heads are visualized in \cref{app:attention-heads-of-transformer}. 
Each subplot depicts the features to which the classification token $\mathtt{[CLS]}$ attends. The attention weight determines the intensity of the line between the two.

-Referring to the results from the appendix, we note that attention heads learn diverse patterns, as most heads attend to different tokens at once learning different relations. However, certain heads exhibit redundancy. For earlier layers in the network, the classification tokens gather from multiple tokens with uniform attention weights, whereas for the final self-attention layers, attention heads specialise in relations that seem related to rule-based trade classification. \cref{fig:head-tick} depicts a classification head that focuses solely on the change in trade price akin to the tick rule. In \cref{fig:head-tsize} the classification token in the neighbouring head gathers simultaneously from multiple size-related features similar to the trade size rule. Finally, \cref{fig:head-lr} is alike to the \gls{LR} algorithm with additional dependencies on the moneyness. For other attention heads the purpose they serve in the network remains open.
+Referring to the results from the appendix, we note that attention heads learn diverse patterns, as most heads attend to different tokens at once, learning different relations. However, certain heads exhibit redundancy. For earlier layers in the network, the classification tokens gather from multiple tokens with uniform attention weights, whereas for the final self-attention layers, attention heads specialize in relations that seem related to rule-based trade classification. \cref{fig:head-tick} depicts a classification head that focuses solely on the change in trade price, akin to the tick rule. In \cref{fig:head-tsize}, the classification token in the neighboring head gathers simultaneously from multiple size-related features, similar to the trade size rule. Finally, \cref{fig:head-lr} is akin to the \gls{LR} algorithm with additional dependencies on the moneyness. For other attention heads, the purpose they serve in the network remains open.

The redundancy between attention heads is possibly explained by the use of attention dropout in our networks (cp. \cref{sec:hyperparameter-tuning}), which randomly deactivates units of the network during training and forces the network to learn redundant representations. A similar point is made by \textcite[][8--9]{clarkWhatDoesBERT2019} for the related \gls{BERT} model. Our finding of uniform attention weights in earlier layers of the network is consistent with that of \textcite[][4]{abnarQuantifyingAttentionFlow2020} made for \gls{BERT}.

When repeated for other trades, the identified roles of the attention heads are partially retained, but it is important to highlight that a more comprehensive analysis is required. We suggest revisiting this topic in future research, as it potentially enables uncovering new rule-based approaches and understanding Transformer-based trade classification in more detail.

-\textbf{Embedding Visualisation}
+\textbf{Embedding Visualization}

-For the Transformer we know from \cref{sec:token-embeddings}, that embeddings can capture similarities by arranging related objects closer in embedding space. Visualising the learnt embeddings enables insights into the model.
+For the Transformer, we know from \cref{sec:token-embeddings} that embeddings can capture similarities by arranging related objects closer in embedding space. Visualizing the learned embeddings enables insights into the model. 
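As a concrete illustration of how such similarities can be read off the learned embeddings, the sketch below ranks roots by cosine similarity to a chosen symbol and projects the embedding matrix to two dimensions with $t$-SNE; the array and mapping names are hypothetical, and the exact procedure used for the figures is described in the following paragraphs.
\begin{verbatim}
import numpy as np
from sklearn.manifold import TSNE

def most_similar(emb, index_of, symbol, k=10):
    """Rank roots by cosine similarity to `symbol`.

    `emb` is the (n_roots, d) embedding matrix taken from the feature
    tokenizer; `index_of` maps a root symbol to its row index."""
    v = emb[index_of[symbol]]
    sims = emb @ v / (np.linalg.norm(emb, axis=1) * np.linalg.norm(v) + 1e-12)
    inv = {i: s for s, i in index_of.items()}
    order = np.argsort(-sims)
    return [(inv[i], float(sims[i])) for i in order[1:k + 1]]  # skip the symbol itself

# 2D projection for plotting, analogous to the embedding figures:
# coords = TSNE(n_components=2, metric="cosine").fit_transform(emb)
\end{verbatim}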
The embeddings are queried from the feature tokenizer in FT-Transformer. The similarity between embeddings is measured by cosine distance in embedding space. The high-dimensional embeddings are then projected into 2D space using $t$-SNE \autocite[][2587]{vandermaatenVisualizingDataUsing2008}. As it is straightforward to interpret, we focus our analysis on the root, but note that the analysis applies to any numerical and categorical embeddings.

-\cref{fig:categorical-embeddings} illustrates the embeddings exemplary for SPDR S\&P 500 Trust ($\mathtt{SPY}$) and JPMorgan Chase \& Co ($\mathtt{JPM}$) which can be \emph{qualitatively} interpreted.\footnote{As our analysis is condensed to two randomly chosen examples, we encourage the reader to use our interactive visualisation for further exploration. Accessible here \url{https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/KarelZe/Embeddings/main/embedding_projector.config.json}.}
+\cref{fig:categorical-embeddings} illustrates the embeddings, exemplarily for SPDR S\&P 500 Trust ($\mathtt{SPY}$) and JPMorgan Chase \& Co ($\mathtt{JPM}$), which can be \emph{qualitatively} interpreted.\footnote{As our analysis is condensed to two randomly chosen examples, we encourage the reader to use our interactive visualization for further exploration. Accessible at \url{https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/KarelZe/Embeddings/main/embedding_projector.config.json}.}

\begin{figure}[h!]
    \subfloat[Most Similar Embeddings to $\mathtt{SPY}$\label{fig:cat-embeddings-spy}]{\includegraphics[width=0.6\textwidth]{categorical_embeddings_SPY.pdf}}
    \vfill
    \subfloat[Most Similar Embeddings to $\mathtt{JPM}$\label{fig:cat-embeddings-jpm}]{\includegraphics[width=0.6\textwidth]{categorical_embeddings_JPM.pdf}}
-    \caption[Embeddings of Selected Underlyings]{Embeddings of selected underlyings. The plot depicts the projected embedding of SPDR S\&P 500 ETF ($\mathtt{SPY}$) and JPMorgan Chase \& Co ($\mathtt{JPM}$) and their most similar embeddings. Embeddings are projected into 2D-space using $t$-SNE. The ten most similar embeddings by cosine distance in the original space are coloured and annotated. The model is trained on \gls{ISE} data.}
+    \caption[Embeddings of Selected Underlyings]{Embeddings of selected underlyings. The plot depicts the projected embeddings of SPDR S\&P 500 ETF ($\mathtt{SPY}$) and JPMorgan Chase \& Co ($\mathtt{JPM}$) and their most similar embeddings. Embeddings are projected into 2D-space using $t$-SNE. The ten most similar embeddings by cosine distance in the original space are colored and annotated. The model is trained on \gls{ISE} data.}
    \label{fig:categorical-embeddings}
\end{figure}

@@ -801,11 +793,11 @@ \subsection{Feature Importance}\label{sec:feature-importance}

Regarding JPMorgan Chase \& Co. ($\mathtt{JPM}$) in \cref{fig:cat-embeddings-jpm}, the most similar embedding is that of Bank of America ($\mathtt{BAC}$). Other similar embeddings include financial service providers like Amerigroup ($\mathtt{XGZ}$) and Janus Henderson Group ($\mathtt{ZPR}$). These results suggest that the model learned to group US financials, even without sector information provided. However, this argumentation does not apply to other related embeddings such as the Apollo Group ($\mathtt{OKO}$) or United Parcel Service of America ($\mathtt{YUP}$).

% Autodesk Inc. ($\mathtt{ADQ}$) , Centex Corp. 
($\mathtt{YYV}$), United Parcel Service of America ($\mathtt{YUP}$), Wild Oats Markets ($\mathtt{ZAC}$), SPDR S\&P 500 ETF ($\mathtt{SUE}$), and SPDR Dow Jones Industrial Average ($\mathtt{DIA}$). -While these exemplary results indicate that the model can learn meaningful representations of the underlying, we must acknowledge its limitations. Both underlyings are frequently traded in our dataset, which may lead to meaningful embeddings. For infrequent underlyings, embeddings are likely to be close to their random initialisation and lack meaningful patterns due to limited parameter updates and missing context. This issue is analogous to handling rare vocabulary items found in natural language processing. As the underlying plays a subordinate role in classification, this caveat is accepted. +While these exemplary results indicate that the model can learn meaningful representations of the underlying, we must acknowledge its limitations. Both underlyings are frequently traded in our dataset, which may lead to meaningful embeddings. For infrequent underlyings, embeddings are likely to be close to their random initialization and lack meaningful patterns due to limited parameter updates and missing context. This issue is analogous to handling rare vocabulary items found in natural language processing. As the underlying plays a subordinate role in classification, this caveat is accepted. \textbf{SAGE Values} -We compare the feature importances of rule-based and machine learning-based classifiers using \gls{SAGE}, which offers a clear interpretation of each feature's contribution to the prediction. The zero-one loss is chosen as loss function, which is appealing due to the direct link to accuracy. Based on the distribution of the \gls{ISE} test set, a na\"ive prediction of the majority class yields an accuracy of \SI{51.4027}{\percent} or a zero-one loss of $1- \num{0.514027} = \num{0.485973}$. \gls{SAGE} attributes the outperformance of machine learning or rule-based classifiers over the na\"ive prediction to the features based on Shapley values. The sum of all \gls{SAGE} values for a given predictor represents the difference in loss compared to the na\"ive classification. +We compare the feature importances of rule-based and machine learning-based classifiers using \gls{SAGE}, which offers a clear interpretation of each feature's contribution to the prediction. The zero-one loss is chosen as a loss function, which is appealing due to the direct link to accuracy. Based on the distribution of the \gls{ISE} test set, a na\"ive prediction of the majority class yields an accuracy of \SI{51.4027}{\percent} or a zero-one loss of $1- \num{0.514027} = \num{0.485973}$. \gls{SAGE} attributes the outperformance of machine learning or rule-based classifiers over the na\"ive prediction to the features based on Shapley values. The sum of all \gls{SAGE} values for a given predictor represents the difference in loss compared to the na\"ive classification. \begin{figure}[h!] \centering @@ -814,7 +806,7 @@ \subsection{Feature Importance}\label{sec:feature-importance} \label{fig:sage-importances} \end{figure} -From \cref{fig:sage-importances} that all models achieve the largest improvement in loss from quoted prices and if provided from the quoted sizes. The contribution of the \gls{NBBO} to performance is roughly equal for all models, suggesting that even simple heuristics effectively exploit the data. For machine learning-based predictors, quotes at the exchange level hold equal importance in classification. 
This contrast with \gls{GSU} methods, which rely less on exchange-level quotes and mostly classify trades based on upstream rules. The performance improvements from the trade size and quoted size, are slightly lower for rule-based methods compared to machine learning-based methods. Transformers and \glspl{GBRT} gain performance from the addition of option features, i.e., moneyness and time-to-maturity. In conjunction with the results from the robustness checks, this suggests that the improvement observed for long-running options or \gls{ITM} options is directly linked to the moneyness or time to maturity of the traded option itself. However, it remains unclear how these features interact with others. Regardless of the method used, changes in trade price before or after the trade are irrelevant for classification and can even harm performance. Similarly, additional features such as option type, issue type, the trading volume of the option series, and the underlying are also irrelevant. Thus, we note that there is a significant overlap between the importance of features in classical trade classification rules and machine learning-based predictors.
+From \cref{fig:sage-importances}, we see that all models achieve the largest improvement in loss from quoted prices and, if provided, from the quoted sizes. The contribution of the \gls{NBBO} to performance is roughly equal for all models, suggesting that even simple heuristics effectively exploit the data. For machine learning-based predictors, quotes at the exchange level hold equal importance in classification. This contrasts with \gls{GSU} methods, which rely less on exchange-level quotes and mostly classify trades based on upstream rules. The performance improvements from the trade size and quoted size are slightly lower for rule-based methods compared to machine learning-based methods. Transformers and \glspl{GBRT} gain performance from the addition of option features, i.e., moneyness and time-to-maturity. In conjunction with the results from the robustness checks, this suggests that the improvements observed for long-running options or \gls{ITM} options are directly linked to the moneyness or time to maturity of the traded option itself. However, it remains unclear how these features interact with others. Regardless of the method used, changes in trade price before or after the trade are irrelevant for classification and can even harm performance. Similarly, additional features such as option type, issue type, the trading volume of the option series, and the underlying are also irrelevant. Thus, we note that there is a significant overlap between the importance of features in classical trade classification rules and machine learning-based predictors.

\todo{Importance of Moneyness and Time-to-Maturity}
\todo{Distribution in Sample: TTM, Trade Size, Moneyness}
@@ -833,7 +825,7 @@ \section{Application in Transaction Cost Estimation}\label{sec:application}
    \label{eq:effective-spread}
\end{equation}

-Like before, $i$ indexes the security and $t$ the point in time. Here, $D_{i,t}$ is the trade direction, which is either $1$ for customer buy orders and $-1$ for sell orders. If the trade initiator is known, we set $D_{i,t} = y_{i,t}$ and $D_{i,t}=\hat{y}_{it}$, if inferred from a rule or classifier. 
As the fundamental value $V_{i,t}$ is unobserved at the time of the trade, we follow a common track in research and use the midpoint of the prevailing quotes as an observable proxy.\footnote{An alternative treatment for options is discussed in \textcite[][4975--4976]{muravyevOptionsTradingCosts2020} Our focus is on the midspread, as it is the most common proxy for the value.} This is also a natural choice, under the assumption that, on average, the spread is symmetric and centred around the true fundamental value \autocite[][1018]{leeMarketIntegrationPrice1993}. We multiply the so-obtained half-spread by $2 \times$ to obtain the effective spread, which represents the cost for a round trip trade involving a buy and sell excluding commissions.
+Like before, $i$ indexes the security and $t$ the point in time. Here, $D_{i,t}$ is the trade direction, which is either $1$ for customer buy orders and $-1$ for sell orders. If the trade initiator is known, we set $D_{i,t} = y_{i,t}$; if it is inferred from a rule or classifier, we set $D_{i,t}=\hat{y}_{i,t}$. As the fundamental value $V_{i,t}$ is unobserved at the time of the trade, we follow common practice in research and use the midpoint of the prevailing quotes as an observable proxy.\footnote{An alternative treatment for options is discussed in \textcite[][4975--4976]{muravyevOptionsTradingCosts2020}. Our focus is on the midspread, as it is the most common proxy for the value.} This is also a natural choice under the assumption that, on average, the spread is symmetric and centered around the true fundamental value \autocite[][1018]{leeMarketIntegrationPrice1993}. We double the so-obtained half-spread to obtain the effective spread, which represents the cost of a round-trip trade involving a buy and a sell, excluding commissions.

As apparent from \cref{eq:effective-spread}, poor estimates of the predicted trade direction lead to an under- or overestimated effective spread, and hence to a skewed trade cost estimate. Only for trades at the midspread is the predicted trade direction irrelevant, since the effective spread is zero. By comparing the true effective spread with the estimated one, we can derive the economic significance. A classifier correctly classifying every trade achieves an effective spread estimate equal to the true spread. For a random classifier, the estimated effective spread is around zero, as misclassified trades enter the spread with the opposite sign and offset the correctly classified ones. A small synthetic sketch of this effect is given below.
@@ -847,7 +839,7 @@ \section{Application in Transaction Cost Estimation}\label{sec:application}

\textbf{Results}

-The true and the estimated effective spreads for the test sets are shown in the \cref{tab:effective-spread} aggregated by mean. \textcite[][896--897]{savickasInferringDirectionOption2003} estimated the effective spreads of rules on a older subset of option trades at the \gls{CBOE}, which can be compared against. Our results match theirs in magnitude.
+The true and the estimated effective spreads for the test sets are shown in \cref{tab:effective-spread}, aggregated by mean. \textcite[][896--897]{savickasInferringDirectionOption2003} estimated the effective spreads of rules on an older subset of option trades at the \gls{CBOE}, which can be compared against. Our results match theirs in magnitude.
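To illustrate the offsetting effect described above, the toy sketch below computes the effective spread, assuming \cref{eq:effective-spread} takes the standard form $S_{i,t} = 2 D_{i,t}(P_{i,t} - V_{i,t})$ with the quote midpoint as $V_{i,t}$, once with the true directions and once with directions from a hypothetical rule that is \SI{75}{\percent} accurate. The data are synthetic and purely illustrative.
\begin{verbatim}
import numpy as np

def effective_spread(price, mid, direction):
    """Per-trade effective spread S = 2 * D * (P - V) with the quote
    midpoint V as proxy for the fundamental value."""
    return 2.0 * direction * (price - mid)

rng = np.random.default_rng(1)
mid = rng.uniform(1.0, 5.0, size=1000)          # synthetic quote midpoints
half = rng.uniform(0.01, 0.10, size=1000)       # synthetic half-spreads
d_true = rng.choice([-1, 1], size=1000)         # true trade directions
price = mid + d_true * half                     # trades execute at the quotes
d_hat = np.where(rng.random(1000) < 0.75, d_true, -d_true)  # 75% accurate rule

print(effective_spread(price, mid, d_true).mean())  # true mean effective spread
print(effective_spread(price, mid, d_hat).mean())   # shrunk by misclassification
\end{verbatim}
With a fully random classifier, the correctly and incorrectly signed trades cancel on average and the estimated effective spread collapses towards zero, matching the argument above.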
\begin{table}[!ht]
    \centering
@@ -856,14 +848,14 @@ \section{Application in Transaction Cost Estimation}\label{sec:application}
    \label{tab:effective-spread}
\end{table}

-In summary, quote-based algorithms like the quote rule and the \gls{LR} algorithm severely overestimate the effective spread. The overestimate is less severe for the \gls{CLNV} algorithm due to stronger dependency on the tick rule. The tick rule itself achieves estimates closest to the true effective spread, which is \num[round-mode=places, round-precision=3]{0.004926}[\$]{} and \num[round-mode=places, round-precision=3]{0.012219}[\$]{} for the \gls{ISE} and \gls{CBOE} sample respectively. As primarily tick-based algorithms, like the tick rule or \gls{EMO} rule, act as a random classifier in our samples, we conclude that the close estimate is an artifact to randomness, not due to superior predictive power. This observation is in line with \textcite[][897]{savickasInferringDirectionOption2003}, who make a similar argument for the \gls{EMO} rule on \gls{CBOE} trades. For rule-based algorithms $\operatorname{gsu}_{\mathrm{large}}$ provides reasonable estimates of the effective spread while achieving high classification accuracy.
+In summary, quote-based algorithms like the quote rule and the \gls{LR} algorithm severely overestimate the effective spread. The overestimate is less severe for the \gls{CLNV} algorithm due to its stronger dependency on the tick rule. The tick rule itself achieves estimates closest to the true effective spread, which is \num[round-mode=places, round-precision=3]{0.004926}[\$]{} and \num[round-mode=places, round-precision=3]{0.012219}[\$]{} for the \gls{ISE} and \gls{CBOE} samples, respectively. As primarily tick-based algorithms, like the tick rule or \gls{EMO} rule, act as random classifiers in our samples, we conclude that the close estimate is an artifact of randomness, not due to superior predictive power. This observation is in line with \textcite[][897]{savickasInferringDirectionOption2003}, who make a similar argument for the \gls{EMO} rule on \gls{CBOE} trades. Among rule-based algorithms, $\operatorname{gsu}_{\mathrm{large}}$ provides reasonable estimates of the effective spread while achieving high classification accuracy.

Among our supervised classifiers, the FT-Transformer and \gls{GBRT} trained on \gls{FS} option provide estimates closest to the true effective spread, in particular on the \gls{CBOE} sample. For semi-supervised classifiers, Transformer-based models approximate the true effective spread best. This manifests in a predicted effective spread at the \gls{ISE} of \SI[round-mode=places, round-precision=3]{0.013118}[\$]{} versus a true value of \SI[round-mode=places, round-precision=3]{0.004926}[\$]{}. The null hypothesis of equal medians is rejected at the \SI{1}{\percent} level for all classifiers.

-Thus, \gls{GSU} method (large) provides the best estimate of the effective spread if the true labels are absent. For labelled data, Transformer or gradient boosting-based approaches can provide more accurate estimates. The de facto standard, the \gls{LR} algorithm, fails to deliver accurate estimates and may bias research.
+Thus, the \gls{GSU} method (large) provides the best estimate of the effective spread if the true labels are absent. For labeled data, Transformer or gradient boosting-based approaches can provide more accurate estimates. The de facto standard, the \gls{LR} algorithm, fails to deliver accurate estimates and may bias research.
\todo{“In addition, my results offer little help in answering why option bid-ask spreads are so large. This is one of the biggest puzzles in the options literature—existing theories of the option spread fail to explain its magnitude and shape (Muravyev and Pearson (2014)).”}

\todo{compare against \textcite[][4981]{muravyevOptionsTradingCosts2020} or \autocite{kaeckPriceImpactBid2022}}

-\todo{Think about reporting as percentage? Adjust formula from above.}
+\todo{Think about reporting as a percentage. Adjust the formula from above.}

\todo{Look into \textcite{muravyevOptionsTradingCosts2020}}

\todo{Options listed on multiple exchanges have narrower spreads than those listed on a single exchange, but the difference diminishes as option volume increases. Option spreads become wider when a competing exchange delists the option.\autocite{mayhewCompetitionMarketStructure2002}}
diff --git a/reports/Content/rule-approaches.tex b/reports/Content/rule-approaches.tex
index 12eaa427..74384942 100644
--- a/reports/Content/rule-approaches.tex
+++ b/reports/Content/rule-approaches.tex
@@ -113,7 +113,7 @@ \subsubsection{Trade Size Rule}\label{sec:trade-size-rule}
    \end{cases}
    \label{eq:trade-size-rule}
\end{equation}
-The trade size rule in \cref{eq:trade-size-rule} classifies based on a match between the size of the trade $\tilde{P}_{i, t}$ and the quoted bid and ask sizes. The rationale is, that the market maker tries to fill the limit order of a customer, which results in the trade being executed at the contemporaneous bid or ask, with a trade size equalling the quoted size \autocite[][13]{grauerOptionTradeClassification2022}. When both the size of the ask and bid correspond with the trade size or the trade size does not match the quoted sizes, the result is ambiguous.
+The trade size rule in \cref{eq:trade-size-rule} classifies based on a match between the size of the trade $\tilde{P}_{i, t}$ and the quoted bid and ask sizes. The rationale is that the market maker tries to fill the limit order of a customer, which results in the trade being executed at the contemporaneous bid or ask, with a trade size equaling the quoted size \autocite[][13]{grauerOptionTradeClassification2022}. When both the size of the ask and the bid correspond with the trade size, or the trade size does not match the quoted sizes, the result is ambiguous.

Expectedly, the improvement is highest for trades at the quotes and reverses for trades outside the quote \autocite[][15]{grauerOptionTradeClassification2022}. Based on these results, the trade size rule may only be applied selectively to trades near or at the quote. Since only a fraction of all trades can be classified with the trade size rule, the rule must be combined with other basic or hybrid rules for complete coverage. The subsequent section introduces four hybrid algorithms that combine basic rules into more sophisticated algorithms.

@@ -144,14 +144,14 @@ \subsection{Hybrid Rules}\label{sec:hybrid-rules}
            \input{./Graphs/grauer-algo.pdf_tex}}
    }
    \hfill\null
-    \caption[Overview Over Hybrid Trade Classification Rules]{Overview Over hybrid trade classification rules. The Figure visualises the components of the \acrshort{LR} algorithm, \acrshort{EMO} rule, the \acrshort{CLNV} method, and an arbitrary, stacked combination relative to the quotes. Rules at the midpoint or the quotes are slightly exaggerated for better readability. 
Own work inspired by \textcite[][167]{poppeSensitivityVPINChoice2016}.}
+    \caption[Overview Over Hybrid Trade Classification Rules]{Overview over hybrid trade classification rules. The figure visualizes the components of the \acrshort{LR} algorithm, \acrshort{EMO} rule, the \acrshort{CLNV} method, and an arbitrary, stacked combination relative to the quotes. Rules at the midpoint or the quotes are slightly exaggerated for better readability. Own work inspired by \textcite[][167]{poppeSensitivityVPINChoice2016}.}
    \label{fig:hybrid-algorithms}
\end{figure}

-Popular variants include the \gls{LR} algorithm, the \gls{EMO} rule, and the \gls{CLNV} method. All three algorithms utilise the quote and tick rule to a varying extent, as depicted in \cref{fig:hybrid-lr,fig:hybrid-emo,fig:hybrid-clnv}. Basic rules are selected based on the proximity of the trade price to the quotes. We study all algorithms in detail in \cref{sec:lee-and-ready-algorithm,sec:ellis-michaely-ohara-rule,sec:chakarabarty-li-nguyen-van-ness-method}.
+Popular variants include the \gls{LR} algorithm, the \gls{EMO} rule, and the \gls{CLNV} method. All three algorithms utilize the quote and tick rule to a varying extent, as depicted in \cref{fig:hybrid-lr,fig:hybrid-emo,fig:hybrid-clnv}. Basic rules are selected based on the proximity of the trade price to the quotes. We study all algorithms in detail in \cref{sec:lee-and-ready-algorithm,sec:ellis-michaely-ohara-rule,sec:chakarabarty-li-nguyen-van-ness-method}.

-As put forth by \textcite[][18]{grauerOptionTradeClassification2022}, basic or hybrid rules can be combined through stacking. One such combination is depicted in \cref{fig:hybrid-grauer}. This approach generalises the aforementioned algorithms, as the applied rule is no longer dependent on the proximity to the quotes, but rather on the classifiability of the trade with the primary rules given by the domains and their ordering. We cover this approach last.
+As put forth by \textcite[][18]{grauerOptionTradeClassification2022}, basic or hybrid rules can be combined through stacking. One such combination is depicted in \cref{fig:hybrid-grauer}. This approach generalizes the aforementioned algorithms, as the applied rule is no longer dependent on the proximity to the quotes, but rather on the classifiability of the trade with the primary rules given by the domains and their ordering. We cover this approach last.

\subsubsection{Lee and Ready Algorithm}\label{sec:lee-and-ready-algorithm}

@@ -171,7 +171,7 @@ \subsubsection{Ellis-Michaely-O'Hara Rule}\label{sec:ellis-michaely-ohara-rule}

-\textcite[][536]{ellisAccuracyTradeClassification2000} examine the performance of the previous algorithms for stocks traded at \gls{NASDAQ}. 
By analyzing miss-classified trades with regard to the proximity of the trade to the quotes, they observe, that the quote rule and by extension, the \gls{LR} algorithm, perform particularly well at classifying trades executed at the bid and the ask price but trail the performance of the tick rule for trades inside or outside the spread \autocite[][535--536]{ellisAccuracyTradeClassification2000}. The authors combine these observations into a single rule, known as the \gls{EMO} algorithm. As such, the \gls{EMO} algorithm extends the tick rule by classifying trades at the quotes using the quote rule, and all other trades with the tick test. Formally, the classification rule is given by: \begin{equation} @@ -189,7 +189,7 @@ \subsubsection{Ellis-Michaely-O'Hara \subsubsection{Chakrabarty-Li-Nguyen-Van-Ness Method}\label{sec:chakarabarty-li-nguyen-van-ness-method} -Like the previous two algorithms, the \gls{CLNV} method of \textcite[][3809]{chakrabartyTradeClassificationAlgorithms2012} is a hybrid of the quote and tick rule and extends the \gls{EMO} rule by a differentiated treatment of trades inside the quotes, which are notoriously hard to classify. The authors segment the bid-ask spread into deciles (ten equal-width bins) and classify trades around the midpoint (fourth to seventh decile) by the tick rule and trades close or outside the quotes are categorised by the tick rule. +Like the previous two algorithms, the \gls{CLNV} method of \textcite[][3809]{chakrabartyTradeClassificationAlgorithms2012} is a hybrid of the quote and tick rule and extends the \gls{EMO} rule by a differentiated treatment of trades inside the quotes, which are notoriously hard to classify. The authors segment the bid-ask spread into deciles (ten equal-width bins) and classify trades around the midpoint (fourth to seventh decile) by the tick rule and trades close or outside the quotes are categorized by the tick rule. \begin{equation} \operatorname{clnv} \colon \mathbb{N}^2 \to \mathcal{Y}, \quad \operatorname{clnv}(i, t)= @@ -201,7 +201,7 @@ \subsubsection{Chakrabarty-Li-Nguyen-Van-Ness \label{eq:CLNV-rule} \end{equation} -The algorithm is summarised in \cref{eq:CLNV-rule}. It is derived from a performance comparison of the tick rule (\gls{EMO} rule) against the quote rule (\gls{LR} algorithm) on stock data, whereby the accuracy was assessed separately for each decile.\footnote{The spread is assumed to be positive and evenly divided into ten deciles and the first to third deciles are classified by the quote rule. Counted from the bid, the first decile starts at $B_{i,t}$ and ends at $B_{i,t} + \tfrac{3}{10} (A_{i,t} - B_{i,t}) = \tfrac{7}{10} B_{i,t} + \tfrac{3}{10} A_{i,t}$ third decile. As all trade prices are below the midpoint, they are classified as a sell.} The classical \gls{CLNV} method uses the backward-looking tick rule. In the spirit of \textcite[][735]{leeInferringTradeDirection1991}, the tick test can be exchanged for the reverse tick test. +The algorithm is summarized in \cref{eq:CLNV-rule}. It is derived from a performance comparison of the tick rule (\gls{EMO} rule) against the quote rule (\gls{LR} algorithm) on stock data, whereby the accuracy was assessed separately for each decile.\footnote{The spread is assumed to be positive and evenly divided into ten deciles and the first to third deciles are classified by the quote rule. 
Counted from the bid, the first decile starts at $B_{i,t}$ and ends at $B_{i,t} + \tfrac{3}{10} (A_{i,t} - B_{i,t}) = \tfrac{7}{10} B_{i,t} + \tfrac{3}{10} A_{i,t}$ third decile. As all trade prices are below the midpoint, they are classified as a sell.} The classical \gls{CLNV} method uses the backward-looking tick rule. In the spirit of \textcite[][735]{leeInferringTradeDirection1991}, the tick test can be exchanged for the reverse tick test. \subsubsection{Stacked Rule}\label{sec:stacked-rule} diff --git a/reports/Content/semisupervised-approaches.tex b/reports/Content/semisupervised-approaches.tex index 557607e4..4f29621a 100644 --- a/reports/Content/semisupervised-approaches.tex +++ b/reports/Content/semisupervised-approaches.tex @@ -4,65 +4,65 @@ \section{Semi-Supervised Approaches}\label{sec:semi-supervised-approaches} \subsection{Framing as a Semi-supervised Learning Problem}\label{sec:problem-framing-2} -The supervised approaches depend on the availability of the trade initiator as the true label. Yet, obtaining the label is often restricted to the rare cases, where the trade initiator is provided by the exchange or to subsets of trades where the initiator can be inferred through matching procedures (cp. \cref{sec:trade-initiator}), which may bias the selection. Unlabelled trades, though, are abundant and can help improve the generalisation performance of the classifier. This concern is addressed by semi-supervised methods. +The supervised approaches depend on the availability of the trade initiator as the true label. Yet, obtaining the label is often restricted to the rare cases, where the trade initiator is provided by the exchange or to subsets of trades where the initiator can be inferred through matching procedures (cp. \cref{sec:trade-initiator}), which may bias the selection. Unlabeled trades, though, are abundant and can help improve the generalization performance of the classifier. This concern is addressed by semi-supervised methods. -Semi-supervised methods leverage partially-labelled data by learning an algorithm on unlabelled instances alongside true labels \autocite[][6]{chapelleSemisupervisedLearning2006}. They are centred around the semi-supervised assumption of smoothness, which states that if two samples say $\mathbf{x}_{1}$ and $\mathbf{x}_{2}$, are nearby in a high-density region, their class labels $y_{1}$ and $y_{2}$ should also be similar. Vice versa, if data points are separated by a low-density region, their labels may be different \autocite[][5]{chapelleSemisupervisedLearning2006}. +Semi-supervised methods leverage partially-labeled data by learning an algorithm on unlabeled instances alongside true labels \autocite[][6]{chapelleSemisupervisedLearning2006}. They are centered around the semi-supervised assumption of smoothness, which states that if two samples say $\mathbf{x}_{1}$ and $\mathbf{x}_{2}$, are nearby in a high-density region, their class labels $y_{1}$ and $y_{2}$ should also be similar. Vice versa, if data points are separated by a low-density region, their labels may be different \autocite[][5]{chapelleSemisupervisedLearning2006}. \begin{figure}[ht] \centering \includegraphics[width=0.8\linewidth]{decision-boundary-semi-supervised.pdf} - \caption[Decision Boundary of Supervised and Semi-supervised Classifiers]{Decision boundary of a supervised and semi-supervised classifier. The supervised classifier is trained entirely on labelled data. 
The semi-supervised classifier uses both labelled and unlabelled instances to determine the decision boundary. Predicted class probabilities of \mycircle{viz-red} are visualised as a contour. Here, the unlabelled data points, drawn as \mycircle{viz-white}, lead to more confident predictions and a stretched class.} + \caption[Decision Boundary of Supervised and Semi-supervised Classifiers]{Decision boundary of a supervised and semi-supervised classifier. The supervised classifier is trained entirely on labeled data. The semi-supervised classifier uses both labeled and unlabeled instances to determine the decision boundary. Predicted class probabilities of \mycircle{viz-red} are visualized as a contour. Here, the unlabeled data points, drawn as \mycircle{viz-white}, lead to more confident predictions and a stretched class.} \label{fig:supervised-semi-supervised} \end{figure} -Applied to trade classification, with semi-supervised methods we implicitly assume that trades with similar features, such as a common trade price and quotes, conform to the same class. The purpose of unlabelled trades is to help efficiently determine the boundary around regions of neighbouring trades resulting in improved generalisation performance. A visualisation of a decision boundary of (semi-)supervised classifier is given in \cref{fig:supervised-semi-supervised}. +Applied to trade classification, with semi-supervised methods we implicitly assume that trades with similar features, such as a common trade price and quotes, conform to the same class. The purpose of unlabeled trades is to help efficiently determine the boundary around regions of neighboring trades resulting in improved generalization performance. A visualization of a decision boundary of (semi-)supervised classifier is given in \cref{fig:supervised-semi-supervised}. -The semi-supervised setting requires extending our notation from \cref{sec:problem-framing}, by distinguishing between labelled and unlabelled instances. Like before, $\mathcal{D}=\left\{\left(\mathbf{x}_i, y_i\right)\right\}_{i=1}^N$ denotes all labelled trades. Unlabelled datapoints are stored in separate set $\mathcal{U} = \left\{\mathbf{x}_i\right\}_{i=1}^{K}$. Our coverage of semi-supervised approaches includes self-training for gradient boosting and pre-training of Transformers, which we derive from a subsequent discussion. +The semi-supervised setting requires extending our notation from \cref{sec:problem-framing}, by distinguishing between labeled and unlabeled instances. Like before, $\mathcal{D}=\left\{\left(\mathbf{x}_i, y_i\right)\right\}_{i=1}^N$ denotes all labeled trades. Unlabeled datapoints are stored in separate set $\mathcal{U} = \left\{\mathbf{x}_i\right\}_{i=1}^{K}$. Our coverage of semi-supervised approaches includes self-training for gradient boosting and pre-training of Transformers, which we derive from a subsequent discussion. \subsection{Selection of Approaches}\label{sec:selection-of-approaches-1} -Our goal is to extend gradient-boosted trees and Transformers for the semi-supervised setting to make use of the abundant, unlabelled trade data. We are aimed to make minimally intrusive changes to maintain a fair comparison with the supervised counterparts. We find self-training for gradient boosting and pre-training of Transformers suitable for training on labelled and unlabelled trades, as our subsequent discussion derives. 
+Our goal is to extend gradient-boosted trees and Transformers for the semi-supervised setting to make use of the abundant, unlabeled trade data. We are aimed to make minimally intrusive changes to maintain a fair comparison with the supervised counterparts. We find self-training for gradient boosting and pre-training of Transformers suitable for training on labeled and unlabeled trades, as our subsequent discussion derives. \textbf{Gradient Boosting} -The success of supervised gradient boosting led to the development of gradient boosting for the semi-supervised setting. An early work of \textcite[][3--4]{dalche-bucSemisupervisedMarginBoost2001} explores replacing supervised weak learners, i.e., regression trees, with semi-supervised weak learners, i.e., mixture models and minimises a loss function over labelled and unlabelled instances. Another line of research, including \textcites[][290--291]{bennettExploitingUnlabeledData2002}[][2003--2004]{mallapragadaSemiBoostBoostingSemiSupervised2009}, retain supervised weak learners to generate pseudo labels of unlabelled instances per iteration. True labelled and pseudo-labelled data is then used in fitting weak learners of subsequent iterations. Approaches differ regarding the selection criterion of the pseudo-labelled instances. Both lines of work, however, require changes to the boosting procedure or the base learners. +The success of supervised gradient boosting led to the development of gradient boosting for the semi-supervised setting. An early work of \textcite[][3--4]{dalche-bucSemisupervisedMarginBoost2001} explores replacing supervised weak learners, i.e., regression trees, with semi-supervised weak learners, i.e., mixture models and minimizes a loss function over labeled and unlabeled instances. Another line of research, including \textcites[][290--291]{bennettExploitingUnlabeledData2002}[][2003--2004]{mallapragadaSemiBoostBoostingSemiSupervised2009}, retain supervised weak learners to generate pseudo labels of unlabeled instances per iteration. True labeled and pseudo-labeled data is then used in fitting weak learners of subsequent iterations. Approaches differ regarding the selection criterion of the pseudo-labeled instances. Both lines of work, however, require changes to the boosting procedure or the base learners. -An alternative is to pair gradient boosting with self-training. Self-training is a wrapper algorithm around a supervised classifier, that incorporates its most-confident predictions of unlabelled instances into the training procedure \autocite[][190]{yarowskyUnsupervisedWordSense1995}. In contrast to previous methods, pseudo-labels are generated exclusively from the fully-fledged ensemble, which is grown multiple times at a higher computational cost. Being a model-agnostic wrapper, it does not change the classifier and ensures maximum comparability. This, together with the widespread adoption in the literature, makes it a compelling choice for semi-supervised trade classification. +An alternative is to pair gradient boosting with self-training. Self-training is a wrapper algorithm around a supervised classifier, that incorporates its most-confident predictions of unlabeled instances into the training procedure \autocite[][190]{yarowskyUnsupervisedWordSense1995}. In contrast to previous methods, pseudo-labels are generated exclusively from the fully-fledged ensemble, which is grown multiple times at a higher computational cost. 
Being a model-agnostic wrapper, it does not change the classifier and ensures maximum comparability. This, together with the widespread adoption in the literature, makes it a compelling choice for semi-supervised trade classification. \textbf{Transformer} -Whilst Transformers could be combined with self-training, a more promising approach is to pre-train Transformers on unlabelled data, and then fine-tune the network on the remaining labelled instances. Various studies report unanimously performance improvements from pre-training tabular Transformers, including \textcites[][8]{somepalliSaintImprovedNeural2021}[][7]{huangTabTransformerTabularData2020}. +Whilst Transformers could be combined with self-training, a more promising approach is to pre-train Transformers on unlabeled data, and then fine-tune the network on the remaining labeled instances. Various studies report unanimously performance improvements from pre-training tabular Transformers, including \textcites[][8]{somepalliSaintImprovedNeural2021}[][7]{huangTabTransformerTabularData2020}. -Until now we assumed the parameters e.g., weights and biases, of the Transformer to be initialised randomly. The joint goal of pre-training objectives is to initialise a neural network with weights that capture expressive representations of the input and thereby improve generalisation performance over a random initialisation when fine-tuning on a specific task \autocite[][12]{erhanWhyDoesUnsupervised}. The training is now decomposed into two stages: in the first stage the model is trained with respect to the pre-training objective to obtain the parameter estimates on unlabelled instances, and in the second stage the Transformer is initialised with the parameters and then finetuned on the labelled dataset. Particularly beneficial, general embeddings can be learnt during pre-training, even if the true label, i.e., the trade initiator, is unknown or its definition varies between tasks. +Until now we assumed the parameters e.g., weights and biases, of the Transformer to be initialized randomly. The joint goal of pre-training objectives is to initialize a neural network with weights that capture expressive representations of the input and thereby improve generalization performance over a random initialization when fine-tuning on a specific task \autocite[][12]{erhanWhyDoesUnsupervised}. The training is now decomposed into two stages: in the first stage the model is trained with respect to the pre-training objective to obtain the parameter estimates on unlabeled instances, and in the second stage the Transformer is initialized with the parameters and then finetuned on the labeled dataset. Particularly beneficial, general embeddings can be learned during pre-training, even if the true label, i.e., the trade initiator, is unknown or its definition varies between tasks. Pre-training objectives for tabular data differ vastly in their methodology and are often directly adapted from other domains including \gls{MLM} \autocite[][4174]{devlinBERTPretrainingDeep2019}, \gls{RTD} \autocite[][1--3]{clarkElectraPretrainingText2020}, or contrastive learning \autocite[][2]{chenSimpleFrameworkContrastive2020}. As such, \textcite[][7]{huangTabTransformerTabularData2020} adapt \gls{MLM}, whereby features are randomly masked and the objective is to reconstruct the original input. Pre-training by \gls{RTD} aims to identify randomly replaced features and recover a binary mask used for replacement \autocite[][7]{huangTabTransformerTabularData2020}. 
\textcites[][3]{bahriSCARFSelfsupervisedContrastive2022}[][4--5]{yoonVIMEExtendingSuccess2020} reconstruct both the binary feature mask and the original input simultaneously. \textcite[][3]{somepalliSaintImprovedNeural2021} alter the methodology of \textcite[][4--5]{yoonVIMEExtendingSuccess2020} through a contrastive loss function. -With a multitude of methods, tested on different datasets and neural architectures, a fair comparison between pre-training methods is tedious. Yet, \textcite[][2-3]{rubachevRevisitingPretrainingObjectives2022} provide guidance in selecting objectives. Among the pre-training objectives that they benchmark, the \gls{RTD} objective was among the best-performing approaches. The \gls{RTD} objective is easy to optimise, unsupervised, and leaves the model architecture unaltered, which makes \gls{RTD} a compelling choice for pre-training on unlabelled data. +With a multitude of methods, tested on different datasets and neural architectures, a fair comparison between pre-training methods is tedious. Yet, \textcite[][2-3]{rubachevRevisitingPretrainingObjectives2022} provide guidance in selecting objectives. Among the pre-training objectives that they benchmark, the \gls{RTD} objective was among the best-performing approaches. The \gls{RTD} objective is easy to optimize, unsupervised, and leaves the model architecture unaltered, which makes \gls{RTD} a compelling choice for pre-training on unlabeled data. The next chapter covers self-training in detail. \subsection{Gradient Boosted Trees With Self-Training}\label{sec:extensions-to-gradient-boosted-trees} -Self-training is a wrapper algorithm around a probabilistic classifier, that incorporates its predictions of unlabelled instances as pseudo labels \autocite[][190]{yarowskyUnsupervisedWordSense1995}. +Self-training is a wrapper algorithm around a probabilistic classifier, that incorporates its predictions of unlabeled instances as pseudo labels \autocite[][190]{yarowskyUnsupervisedWordSense1995}. -Initially, a base classifier is fitted on the labelled data points in a supervised manner. The classifier then assigns labels, so-called pseudo labels, to unlabelled instances. A subset of unlabelled instances with high-confidence predictions is selected, removed from the unlabelled dataset and added to the pseudo-labelled data dataset. A new classifier is then retrained on the labelled and pseudo-labelled instances \autocite[][190--192]{yarowskyUnsupervisedWordSense1995}. The process is repeated for several iterations until an abortion criterion applies, such as the maximum number of iterations is exhausted or when no unlabelled instances are left to label. +Initially, a base classifier is fitted on the labeled data points in a supervised manner. The classifier then assigns labels, so-called pseudo labels, to unlabeled instances. A subset of unlabeled instances with high-confidence predictions is selected, removed from the unlabeled dataset and added to the pseudo-labeled data dataset. A new classifier is then retrained on the labeled and pseudo-labeled instances \autocite[][190--192]{yarowskyUnsupervisedWordSense1995}. The process is repeated for several iterations until an abortion criterion applies, such as the maximum number of iterations is exhausted or when no unlabeled instances are left to label. -Recall from our discussion on gradient-boosted trees in \cref{sec:gradient-boosting-procedure} that we optimised for the cross-entropy loss on the training set. 
When coupled with self-training in each training iteration the classifier $F$ now jointly minimises the loss over the labelled samples $\mathcal{D}$ and the pseudo-labelled samples $\not{\mathcal{U}}$: +Recall from our discussion on gradient-boosted trees in \cref{sec:gradient-boosting-procedure} that we optimized for the cross-entropy loss on the training set. When coupled with self-training in each training iteration the classifier $F$ now jointly minimizes the loss over the labeled samples $\mathcal{D}$ and the pseudo-labeled samples $\not{\mathcal{U}}$: \begin{equation} L_{\mathrm{ST}}=\frac{1}{\left|\mathcal{D}\right|} \sum_{(\mathbf{x}, y) \in \mathcal{D}} L(F(\mathbf{x}), y)+\frac{\epsilon}{\left|\not{\mathcal{U}}\right|} \sum_{(\mathbf{x}, \tilde{y}) \in \not{\mathcal{U}}} L(F(\mathbf{x}), \tilde{y})+\lambda\|F\|^2, \end{equation} -where $\epsilon$ is a hyperparameter to control the impact of the pseudo-labelled data, $\tilde{y}$ is the pseudo-labelled instance, and $\lambda$ weights the regularisation term \autocite[][4]{aminiSelfTrainingSurvey2023}. +where $\epsilon$ is a hyperparameter to control the impact of the pseudo-labeled data, $\tilde{y}$ is the pseudo-labeled instance, and $\lambda$ weights the regularization term \autocite[][4]{aminiSelfTrainingSurvey2023}. -In every iteration, only unlabelled instances are added to the training set, for which the predicted class probability exceeds a confidence threshold, say $\tau$. This approach has implications, as highlighted by \textcite[][2]{chenDebiasedSelfTrainingSemiSupervised2022}. The threshold $\tau$ becomes an important hyperparameter in controlling that no noisy labels are added to the training set, but a restriction to highly-confidence samples may lead to a data bias and over-confidence in the prediction. Self-training is prone to a confirmation bias, as confident but wrong pseudo labels are erroneously incorporated into the training set, which in effect leads to a propagation of errors in the subsequent training rounds. +In every iteration, only unlabeled instances are added to the training set, for which the predicted class probability exceeds a confidence threshold, say $\tau$. This approach has implications, as highlighted by \textcite[][2]{chenDebiasedSelfTrainingSemiSupervised2022}. The threshold $\tau$ becomes an important hyperparameter in controlling that no noisy labels are added to the training set, but a restriction to highly-confidence samples may lead to a data bias and over-confidence in the prediction. Self-training is prone to a confirmation bias, as confident but wrong pseudo labels are erroneously incorporated into the training set, which in effect leads to a propagation of errors in the subsequent training rounds. -At the same time, self-training puts a high emphasis on the correctness of the probability estimates in the base classifier. This is problematic for decision trees, known to produce poor probability estimates, as probabilities are derived from the class frequency in the leaf node containing few samples \autocite[][357--358]{tanhaSemisupervisedSelftrainingDecision2017}. However, as gradient boosting directly optimises for the cross-entropy loss, the problem found for its ensemble member no longer occurs. +At the same time, self-training puts a high emphasis on the correctness of the probability estimates in the base classifier. 
This is problematic for decision trees, known to produce poor probability estimates, as probabilities are derived from the class frequency in the leaf node containing few samples \autocite[][357--358]{tanhaSemisupervisedSelftrainingDecision2017}. However, as gradient boosting directly optimizes for the cross-entropy loss, the problem found for its ensemble member no longer occurs. Independent of the base classifier, self-training increases computational cost, as training is repeated over several iterations on a growing training set \autocite[][9]{zophRethinkingPretrainingSelftraining2020}. Despite these limitations, the potentially improved decision boundary outweighs the concerns. \subsection{Transformers With Pre-training}\label{sec:extensions-to-transformer} -\gls{RTD} is a pre-training objective proposed by \textcite[][2--3]{clarkElectraPretrainingText2020} for the use in language models. The core idea is to randomly replace tokens with plausible alternatives and learn a binary classifier to distinguish between original and replaced tokens. Intuitionally, the random replacement forces the model to learn generalisable representations of the input, rather than memorising the co-occurrence of certain tokens. Additionally, surprising the model with random tokens strengthens its ability to incorporate contextual information. +\gls{RTD} is a pre-training objective proposed by \textcite[][2--3]{clarkElectraPretrainingText2020} for the use in language models. The core idea is to randomly replace tokens with plausible alternatives and learn a binary classifier to distinguish between original and replaced tokens. Intuitionally, the random replacement forces the model to learn generalizable representations of the input, rather than memorizing the co-occurrence of certain tokens. Additionally, surprising the model with random tokens strengthens its ability to incorporate contextual information. \begin{figure}[ht] \centering @@ -75,7 +75,7 @@ \subsection{Transformers With Pre-training}\label{sec:extensions-to-transformer} \todo{Adapt to tabular data} -The approach uses two neural networks, namely the generator and the discriminator, typically implemented as Transformers, as visualised in \cref{fig:random-token-replacement}. The generator is responsible for generating replacement tokens and receives an input sequence, i.e., a sentence, that has been intentionally masked out. It learns to predict the original token of the now-masked token through tokens in the bidirectional context (cp. \cref{sec:attention}). For masking, an additional $\mathtt{[MASK]}$ token is introduced, which extends the vocabulary (cp. \cref{sec:token-embeddings}). Separately for each token, the final hidden state of the masked token is fed through a softmax activation to obtain the predicted probability distribution of the masked token and the cross entropy loss is used to compare against the true distribution. By replacing the masked token with a token from the generator distribution, convincing replacements now take place for some of the original inputs \autocite[][2--3]{clarkElectraPretrainingText2020}. +The approach uses two neural networks, namely the generator and the discriminator, typically implemented as Transformers, as visualized in \cref{fig:random-token-replacement}. The generator is responsible for generating replacement tokens and receives an input sequence, i.e., a sentence, that has been intentionally masked out. 
It learns to predict the original token of the now-masked token through tokens in the bidirectional context (cp. \cref{sec:attention}). For masking, an additional $\mathtt{[MASK]}$ token is introduced, which extends the vocabulary (cp. \cref{sec:token-embeddings}). Separately for each token, the final hidden state of the masked token is fed through a softmax activation to obtain the predicted probability distribution of the masked token and the cross entropy loss is used to compare against the true distribution. By replacing the masked token with a token from the generator distribution, convincing replacements now take place for some of the original inputs \autocite[][2--3]{clarkElectraPretrainingText2020}. The discriminator then receives the corrupted input sequence and is trained to distinguish between original and replaced tokens originating from the generator. The output is a binary mask to be compared against the mask initially used for masking tokens in the generator \autocite[][2--3]{clarkElectraPretrainingText2020}. diff --git a/reports/Content/supervised-approaches.tex b/reports/Content/supervised-approaches.tex index f1d612b7..a877ac82 100644 --- a/reports/Content/supervised-approaches.tex +++ b/reports/Content/supervised-approaches.tex @@ -11,7 +11,7 @@ \subsection{Framing as a Supervised Learning Problem}\label{sec:problem-framing} More insightful, is to not just obtain the most probable class, but also the associated class probabilities for a trade to be a buy or sell. This gives insights into the quality of the prediction. Thus, we frame trade signing as a supervised, probabilistic classification task. This is similar to \textcite[][272]{easleyDiscerningInformationTrade2016}, who alter the tick rule and \gls{BVC} algorithm to obtain the probability estimates of a buy from an individual or aggregated trades, but with a sole focus on trade signing on a trade-by-trade basis and supervised. For machine learning-based classifiers, a probabilistic view enables a richer evaluation but restricts the selection of classifiers. Trade classification rules, as presented in \cref{sec:rule-based-approaches}, do not profit from this alternative formulation as they yield hard probabilities only and so no insight into the confidence of the prediction is gained. -We introduce more notation, which is used throughout. Each data instance consists of a feature vector and the target. The former is given by $\mathbf{x} \in \mathbb{R}^{1 \times M}$ and described by a random variable $X$. Any of the $M$ features in $\mathbf{x}$ may be numerical, e.g., the trade price or categorical e.g., the security type. Like before, the target is given by $y \in \mathcal{Y}$ and described by a random variable $Y$. Each data instance is sampled from a joint probability distribution $\Pr(X, Y)$. The labelled data set with $N$ i.i.d. samples is denoted by $\mathcal{D} =\left\{\left(\mathbf{x}_i, y_i\right)\right\}_{i=1}^N$. For convienience, we define a feature matrix $\mathbf{X}=\left[\mathbf{x}_1,\ldots, \mathbf{x}_N\right]^{\top}$, that stores all instances and a corresponding vector of labels $\mathbf{y}=\left[y_1,\ldots, y_N \right]^{\top}$. +We introduce more notation, which is used throughout. Each data instance consists of a feature vector and the target. The former is given by $\mathbf{x} \in \mathbb{R}^{1 \times M}$ and described by a random variable $X$. Any of the $M$ features in $\mathbf{x}$ may be numerical, e.g., the trade price or categorical e.g., the security type. 
Like before, the target is given by $y \in \mathcal{Y}$ and described by a random variable $Y$. Each data instance is sampled from a joint probability distribution $\Pr(X, Y)$. The labeled data set with $N$ i.i.d. samples is denoted by $\mathcal{D} =\left\{\left(\mathbf{x}_i, y_i\right)\right\}_{i=1}^N$. For convienience, we define a feature matrix $\mathbf{X}=\left[\mathbf{x}_1,\ldots, \mathbf{x}_N\right]^{\top}$, that stores all instances and a corresponding vector of labels $\mathbf{y}=\left[y_1,\ldots, y_N \right]^{\top}$. For our machine learning classifiers, we aim to model $\Pr_{\theta}(y \mid \mathbf{x})$ by fitting a classifier with the parameters $\theta$ on the training set. Given the estimated class probabilities, we retrieve the most probable class in $\mathcal{Y}$ as: \begin{equation} @@ -27,27 +27,27 @@ \subsection{Selection of Approaches}\label{sec:selection-of-approaches} \begin{enumerate}[label=(\roman*),noitemsep] \item \emph{performance:} The approach must deliver state-of-the-art performance in tabular classification tasks. Trades are typically provided as tabular datasets, consisting of rows representing instances and columns representing features. The classifier must be well-suited for probabilistic classification on tabular data. \item \emph{scalability:} The approach must scale to datasets with > 10~Mio. samples. Due to the high trading activity and long data history, datasets may comprise millions of samples, so classifiers must cope with large quantities of trades. -\item \emph{extensibility:} The approach must be extendable to train on partially-labelled trades. +\item \emph{extensibility:} The approach must be extendable to train on partially-labeled trades. \end{enumerate} Trade classification, as we framed it, fits into supervised learning on tabular data, which is comprehensively covered by the research community with several studies reviewing and benchmarking newly proposed approaches against established machine learning methods. \textbf{Wide Tree-Based Ensembles} -Traditionally, tree-based ensembles, in particular, \gls{GBRT} have dominated modelling on tabular data concerning predictive performance \autocites[][24--25]{grinsztajnWhyTreebasedModels2022}[][7]{kadraWelltunedSimpleNets2021}[][8]{gorishniyRevisitingDeepLearning2021}. At its core, tree-based ensembles combine the estimates of individual decision trees into an ensemble to obtain a more accurate prediction. For \gls{GBRT} \autocite[][9]{friedmanGreedyFunctionApproximation2001} the ensemble is constructed by sequentially adding small-sized trees into the ensemble that improve upon the error of the previous trees. Conceptually related to \glspl{GBRT} are random forests. Random forests \autocite[][6]{breimanRandomForests2001} fuse decision trees with the bagging principle \autocite[][123]{breimanBaggingPredictors1996} by growing multiple deep decision trees on random subsets of data and aggregating the individual estimates. +Traditionally, tree-based ensembles, in particular, \gls{GBRT} have dominated modeling on tabular data concerning predictive performance \autocites[][24--25]{grinsztajnWhyTreebasedModels2022}[][7]{kadraWelltunedSimpleNets2021}[][8]{gorishniyRevisitingDeepLearning2021}. At its core, tree-based ensembles combine the estimates of individual decision trees into an ensemble to obtain a more accurate prediction. 
For \gls{GBRT} \autocite[][9]{friedmanGreedyFunctionApproximation2001} the ensemble is constructed by sequentially adding small-sized trees into the ensemble that improve upon the error of the previous trees. Conceptually related to \glspl{GBRT} are random forests. Random forests \autocite[][6]{breimanRandomForests2001} fuse decision trees with the bagging principle \autocite[][123]{breimanBaggingPredictors1996} by growing multiple deep decision trees on random subsets of data and aggregating the individual estimates. \textcite[][7-9]{grinsztajnWhyTreebasedModels2022} trace back the strong performance of tree-based ensembles in tabular classification tasks to being a non-rotationally-invariant learner and tabular data being non-invariant to rotation. By intuition, rows and columns in a tabular dataset may be arranged in an arbitrary order, but each features carries a distinct meaning, which implies that feature values cannot be simply rotated without affecting the overall meaning. Thus, tabular data is non-invariant by rotation. So are tree-based ensembles, as they attend to each feature separately. This property also strengthens the model's ability to uninformative features \autocite[][8-9]{grinsztajnWhyTreebasedModels2022}. -\textcite[][13--14]{ronenMachineLearningTrade2022} have unparalleled success in classifying trades through random forests. Due to the framing as a probabilistic classification task, random forests are not optimal. This is because decision trees yield poorly calibrated probability estimates caused by limited samples in leaf nodes, which propagate to the ensemble \autocite[][356--360]{tanhaSemisupervisedSelftrainingDecision2017}. Gradient boosting is unaffected by this problem, and scales to large data sets due to the availability of highly optimised implementations that approximate the construction of ensemble members and can simultaneously learn from labelled and unlabelled instances. The state-of-the-art performance in tabular classification tasks, together with its ability to scale and extend, renders it suitable for trade classification. +\textcite[][13--14]{ronenMachineLearningTrade2022} have unparalleled success in classifying trades through random forests. Due to the framing as a probabilistic classification task, random forests are not optimal. This is because decision trees yield poorly calibrated probability estimates caused by limited samples in leaf nodes, which propagate to the ensemble \autocite[][356--360]{tanhaSemisupervisedSelftrainingDecision2017}. Gradient boosting is unaffected by this problem, and scales to large data sets due to the availability of highly optimized implementations that approximate the construction of ensemble members and can simultaneously learn from labeled and unlabeled instances. The state-of-the-art performance in tabular classification tasks, together with its ability to scale and extend, renders it suitable for trade classification. \textbf{Deep Neural Networks} -Neural networks have emerged as powerful models for tabular data with several publications claiming to surpass \glspl{GBRT} in terms of performance. For brevity, we focus on two lines of research: regularised networks and attention-based networks, which have accumulated significant interest in the field. A recent overview of tabular deep learning can be found in \textcite[][1--22]{borisovDeepNeuralNetworks2022}. +Neural networks have emerged as powerful models for tabular data with several publications claiming to surpass \glspl{GBRT} in terms of performance. 
For brevity, we focus on two lines of research: regularized networks and attention-based networks, which have accumulated significant interest in the field. A recent overview of tabular deep learning can be found in \textcite[][1--22]{borisovDeepNeuralNetworks2022}. -\emph{Regularised Networks} +\emph{Regularized Networks} -Among the simplest neural networks are \glspl{MLP}, which consists of multiple linear layers with non-linear activation functions in between. \textcite[][9--10]{kadraWelltunedSimpleNets2021} among others, advocate for the use of vanilla \gls{MLP} with an extensive mix of regularisation techniques, such as dropout \autocite{srivastavaDropoutSimpleWay} or residual connections \autocite{heDeepResidualLearning2015}, and report performance improvements over complex tabular-specific architectures or \glspl{GBRT}. Regularisation is expected to enhance generalisation performance, but the benefit is non-exclusive to \gls{MLP}. Conversely, when regularisation is equally applied to tabular-specific architectures, the effect reverses and multiple works including \textcites[][7]{gorishniyRevisitingDeepLearning2021}[][5]{grinsztajnWhyTreebasedModels2022} suggest that regularised \gls{MLP} actually trail the performance of specialised tabular-specific architectures. Also, \glspl{MLP} are rotatinally-invariant learners, as showed in \textcite[][5]{grinsztajnWhyTreebasedModels2022}, which contradicts our reasoning from above. To meet our performance criterion we instead focus on specialised architectures, particularly attention-based networks, while still emphasising the importance of a careful regularisation and optimisation. +Among the simplest neural networks are \glspl{MLP}, which consists of multiple linear layers with non-linear activation functions in between. \textcite[][9--10]{kadraWelltunedSimpleNets2021} among others, advocate for the use of vanilla \gls{MLP} with an extensive mix of regularization techniques, such as dropout \autocite{srivastavaDropoutSimpleWay} or residual connections \autocite{heDeepResidualLearning2015}, and report performance improvements over complex tabular-specific architectures or \glspl{GBRT}. Regularization is expected to enhance generalization performance, but the benefit is non-exclusive to \gls{MLP}. Conversely, when regularization is equally applied to tabular-specific architectures, the effect reverses and multiple works including \textcites[][7]{gorishniyRevisitingDeepLearning2021}[][5]{grinsztajnWhyTreebasedModels2022} suggest that regularized \gls{MLP} actually trail the performance of specialized tabular-specific architectures. Also, \glspl{MLP} are rotatinally-invariant learners, as showed in \textcite[][5]{grinsztajnWhyTreebasedModels2022}, which contradicts our reasoning from above. To meet our performance criterion we instead focus on specialized architectures, particularly attention-based networks, while still emphasizing the importance of a careful regularization and optimization. \emph{Attention-based Networks} @@ -55,13 +55,13 @@ \subsection{Selection of Approaches}\label{sec:selection-of-approaches} TabNet \autocite[][3--5]{arikTabnetAttentiveInterpretable2020}, fuses the concept of decision trees with attention. Similar to growing a decision tree, several sub-networks are used to process the input in a sequential, hierarchical fashion. Sequential attention, a variant of attention, is used to decide which features to select in each step. The output of TabNet is the aggregate of all sub-networks. 
Its poor performance in independent comparisons e.g., \textcites[][7]{kadraWelltunedSimpleNets2021}[][7]{gorishniyRevisitingDeepLearning2021}, raises doubts about its usefulness. -The Self-Attention and Intersample Attention Transformer uses a specialised attention mechanism, the intersample attention, to perform attention over both columns and rows \autocite[][4--5]{somepalliSaintImprovedNeural2021}. Applied to our setting, the model would contextualise information from the trade itself, but also from neighbouring trades, which is an unfair advantage over classical trade classification rules. Similarly, the Non-Parametric Transformer of \textcite[][3--4]{kossenSelfAttentionDatapointsGoing2021} uses the entire data set as a context, which rules out the application in our work. +The Self-Attention and Intersample Attention Transformer uses a specialized attention mechanism, the intersample attention, to perform attention over both columns and rows \autocite[][4--5]{somepalliSaintImprovedNeural2021}. Applied to our setting, the model would contextualize information from the trade itself, but also from neighboring trades, which is an unfair advantage over classical trade classification rules. Similarly, the Non-Parametric Transformer of \textcite[][3--4]{kossenSelfAttentionDatapointsGoing2021} uses the entire data set as a context, which rules out the application in our work. -Differently, TabTransformer \autocite[][2--3]{huangTabTransformerTabularData2020} performs attention per sample on categorical features-only. All numerical features are processed in a separate stream, a \gls{MLP}, which breaks correlations between categorical and numerical features \autocite[][2]{somepalliSaintImprovedNeural2021}. Most importantly though, most features in trade datasets are numerical. As such, trade classification would hardly profit from the Transformer architecture, causing the model to collapse to a vanilla \gls{MLP}. A more comprehensive approach is provided by \textcite[][4--5]{gorishniyRevisitingDeepLearning2021} in the form of FT-Transformer, that processes both numerical inputs and categorical input in Transformer blocks featuring an attention mechanism. Since it achieved state-of-the-art performance in independent empirical studies, like \textcite[][5]{grinsztajnWhyTreebasedModels2022}, and is non-rotationally invariant, we further consider FT-Transformer in our empirical study. Being based on the Transformer architecture, FT-Transformer naturally scales to large amounts of data and can utilise unlabelled data through self-training procedures. +Differently, TabTransformer \autocite[][2--3]{huangTabTransformerTabularData2020} performs attention per sample on categorical features-only. All numerical features are processed in a separate stream, a \gls{MLP}, which breaks correlations between categorical and numerical features \autocite[][2]{somepalliSaintImprovedNeural2021}. Most importantly though, most features in trade datasets are numerical. As such, trade classification would hardly profit from the Transformer architecture, causing the model to collapse to a vanilla \gls{MLP}. A more comprehensive approach is provided by \textcite[][4--5]{gorishniyRevisitingDeepLearning2021} in the form of FT-Transformer, that processes both numerical inputs and categorical input in Transformer blocks featuring an attention mechanism. 
Since it achieved state-of-the-art performance in independent empirical studies, like \textcite[][5]{grinsztajnWhyTreebasedModels2022}, and is non-rotationally invariant, we further consider FT-Transformer in our empirical study. Being based on the Transformer architecture, FT-Transformer naturally scales to large amounts of data and can utilize unlabeled data through self-training procedures. -The findings of \textcite[][50]{ronenMachineLearningTrade2022} do not support the use of neural networks in trade classification. But due to the lack of details regarding the model architecture, regularisation techniques, and training insights, it is necessary to reevaluate these findings in the context of option trades. +The findings of \textcite[][50]{ronenMachineLearningTrade2022} do not support the use of neural networks in trade classification. But due to the lack of details regarding the model architecture, regularization techniques, and training insights, it is necessary to reevaluate these findings in the context of option trades. -To summarise, our study considers gradient boosting and the FT-Transformer, each trained on labelled or partially-labelled trades. This comparison is particularly appealing, as it enables a multi-faceted comparison of wide tree-based ensembles versus deep neural networks, as well as supervised versus semi-supervised methods. +To summarize, our study considers gradient boosting and the FT-Transformer, each trained on labeled or partially-labeled trades. This comparison is particularly appealing, as it enables a multi-faceted comparison of wide tree-based ensembles versus deep neural networks, as well as supervised versus semi-supervised methods. \subsection{Gradient Boosted Trees}\label{sec:gradient-boosted-trees} @@ -74,12 +74,12 @@ \subsubsection{Decision Tree}\label{sec:decision-tree} A decision tree splits the feature space into several disjoint regions $R$ through a sequence of recursive splits. For a binary decision tree, a single split leads to two new sub-regions, whose shape is determined by the features considered for splitting and the preceding splits. Trees are grown in depth until a minimum threshold for the number of samples within a node or some other stopping criterion applies \autocite[][42]{breimanClassificationRegressionTrees2017}. A region corresponds to a terminal node in the tree. For each terminal node of the tree or unsplit region, the predicted response value is constant for the entire region and shared by all its samples \autocite[][229]{breimanClassificationRegressionTrees2017}. -For a tree with $J$ regions $R_1, R_2,\ldots, R_J$, and some numerical input $\mathbf{x}$ the tree can be modelled as: +For a tree with $J$ regions $R_1, R_2,\ldots, R_J$, and some numerical input $\mathbf{x}$ the tree can be modeled as: \begin{equation} h(\mathbf{x})=\sum_{j=1}^{J} \gamma_{j} \mathbb{I}\left(\mathbf{x} \in R_{j}\right), \label{eq:decision-tree} \end{equation} -where $\mathbb{I}$ is the indicator function for region conformance and $\gamma_j$ the region's constant \autocite[][326]{hastietrevorElementsStatisticalLearning2009}. In the regression case, $\gamma_j$ is the mean of all target variables $y_i$ in the specific region. Since all samples of a region share a common response value, the tree estimates resemble a histogram that approximates the true regression surface, as visualised in \cref{fig:decision-boundary-dt}. 
+where $\mathbb{I}$ is the indicator function for region conformance and $\gamma_j$ the region's constant \autocite[][326]{hastietrevorElementsStatisticalLearning2009}. In the regression case, $\gamma_j$ is the mean of all target variables $y_i$ in the specific region. Since all samples of a region share a common response value, the tree estimates resemble a histogram that approximates the true regression surface, as visualized in \cref{fig:decision-boundary-dt}. \begin{figure}[ht] \centering @@ -92,17 +92,17 @@ \subsubsection{Decision Tree}\label{sec:decision-tree} \begin{equation} \operatorname{L}_{\mathrm{SSE}} =\sum_{\mathbf{x}_{i} \in R_j}\left(y_{i}-\gamma_{j}\right)^{2}, \end{equation} -which is subsequently minimised \autocite[][231]{breimanClassificationRegressionTrees2017}. As documented in \textcite[][326]{hastietrevorElementsStatisticalLearning2009} we start with the entire dataset and scan through all combinations of features and possible split values. For a split by the feature $k$ at the value $s$, the child nodes are given by a pair of half-planes: +which is subsequently minimized \autocite[][231]{breimanClassificationRegressionTrees2017}. As documented in \textcite[][326]{hastietrevorElementsStatisticalLearning2009} we start with the entire dataset and scan through all combinations of features and possible split values. For a split by the feature $k$ at the value $s$, the child nodes are given by a pair of half-planes: \begin{equation} R_1(k, s)=\left\{X \mid X_k \leq s\right\} \text { and } R_2(k, s)=\left\{X \mid X_k>s\right\}. \end{equation} -Thereby, the feature $k$ and value $s$ are selected in a way, that the squared error in the child nodes is minimised: +Thereby, the feature $k$ and value $s$ are selected in a way, that the squared error in the child nodes is minimized: \begin{equation} \min _{k, s}\left[\min _{\gamma_1} \sum_{\mathbf{x}_i \in R_1(k, s)}\left(y_i-\gamma_1\right)^2+\min _{\gamma_2} \sum_{\mathbf{x}_i \in R_2(k, s)}\left(y_i-\gamma_2\right)^2\right]. \end{equation} Clearly, growing deeper trees leads to an improvement in the \gls{SSE}. Considering the extreme, where each sample has its region, the tree would achieve a perfect fit in-sample but perform poorly on out-of-sample data. To reduce the sensitivity of the tree to changes in the training data, hence \emph{variance}, size complexity pruning procedures are employed. Likewise, if the decision tree is too simplistic, a high bias contributes to the model's overall expected error. Both extremes are to be avoided. -Ensemble methods decrease the expected error of the decision tree by combining multiple trees in a single model through minimising the bias or variance term or both. Specifically, boosting addresses the bias and variance \autocites[][1672]{schapireBoostingMarginNew1998}[][29]{breimanRandomForests2001}. Next, we focus on \gls{GBRT}, a variant of boosting. +Ensemble methods decrease the expected error of the decision tree by combining multiple trees in a single model through minimizing the bias or variance term or both. Specifically, boosting addresses the bias and variance \autocites[][1672]{schapireBoostingMarginNew1998}[][29]{breimanRandomForests2001}. Next, we focus on \gls{GBRT}, a variant of boosting. \subsubsection{Gradient Boosting Procedure}\label{sec:gradient-boosting-procedure} @@ -110,7 +110,7 @@ \subsubsection{Gradient Boosting Gradient boosting iteratively combines oversimplified models, the weak learners, into an additive model to obtain an improved ensemble estimate. 
This chapter draws on \textcite[][9]{friedmanGreedyFunctionApproximation2001} to derive gradient boosting for binary classification. % classifier with outputs in [-1, 1] -By \cref{sec:problem-framing} we perform binary probabilistic classification and by \cref{sec:trade-initiator} we defined the labels to be $y \in \{-1,1\}$. For gradient boosting, instead of modelling the class-conditional probabilities directly, we model the conditional log odds instead, which can be interpreted as the probability of observing class $1$ or a buyer-initiated trade, and covert to class-conditional probabilities as needed. +By \cref{sec:problem-framing} we perform binary probabilistic classification and by \cref{sec:trade-initiator} we defined the labels to be $y \in \{-1,1\}$. For gradient boosting, instead of modeling the class-conditional probabilities directly, we model the conditional log odds instead, which can be interpreted as the probability of observing class $1$ or a buyer-initiated trade, and covert to class-conditional probabilities as needed. Following \textcite[][9]{friedmanStochasticGradientBoosting2002} we set the loss function to be the cross-entropy loss, given by: \begin{equation} @@ -124,7 +124,7 @@ \subsubsection{Gradient Boosting \end{equation} $F(\mathbf{x})$ is the model's prediction in terms of conditional log odds. The cross-entropy loss, is a reasonable choice, as it is suitable for binary classification, convex, and twice differentiable; properties we exploit later. -We first intialise the model with a naïve prediction, based on the average class $\bar{y}$ from all training samples: +We first initialize the model with a naïve prediction, based on the average class $\bar{y}$ from all training samples: \begin{equation} F_0(\mathbf{x})= \frac{1}{2} \log \left[\frac{1+\bar{y}}{1-\bar{y}}\right]. @@ -133,8 +133,8 @@ \subsubsection{Gradient Boosting \begin{equation} r_i=-\left[\frac{\partial L_{\mathrm{BCE}}\left(y_i, F\left(\mathbf{x}_i\right)\right)}{\partial F\left(\mathbf{x}_i\right)}\right]_{F(\mathbf{x})=F_{m-1}(\mathbf{x})}=2 y_i /\left(1+\exp \left(2 y_i F_{m-1}\left(\mathbf{x}_i\right)\right)\right). \end{equation} -\todo{yields the maximum decrease are similar to the components of the negative gradient descent. However, the major drawback is tha tthe gradient is only defined for data points xi seen during training, contradicting the creation of a generalising model.} -Typically, regression trees (cp. \cref{sec:decision-tree}) are chosen as weak learners since they are computationally cheap and can produce continuous estimates for the residual. The $m$-th regression tree contains $J$ terminal regions, denoted by $R_{j m}, j=1,2, \ldots, J_{m}$. We search for an estimate $\gamma_{j,m}$ for the terminal node $R_{jm}$ that minimises the cross-entropy over all samples within the node: +\todo{yields the maximum decrease are similar to the components of the negative gradient descent. However, the major drawback is that the gradient is only defined for data points xi seen during training, contradicting the creation of a generalizing model.} +Typically, regression trees (cp. \cref{sec:decision-tree}) are chosen as weak learners since they are computationally cheap and can produce continuous estimates for the residual. The $m$-th regression tree contains $J$ terminal regions, denoted by $R_{j m}, j=1,2, \ldots, J_{m}$. 
We search for an estimate $\gamma_{j,m}$ for the terminal node $R_{jm}$ that minimizes the cross-entropy over all samples within the node: \begin{equation} \gamma_{j m}=\arg \min _\gamma \sum_{\mathbf{x}_i \in R_{j m}} \log \left(1+\exp \left(-2 y_i\left(F_{m-1}\left(\mathbf{x}_i\right)+\gamma\right)\right)\right) \label{eq:region-estimate-gbm} @@ -152,7 +152,7 @@ \subsubsection{Gradient Boosting \end{equation} After $M$ iterations we obtain the final estimate calculated as $F_{M}\left(\mathbf{x}\right)$. To avoid \gls{overfitting} the residuals, only proportional steps towards the negative gradient are taken, which is controlled by the learning rate \eta~\autocite[][13]{friedmanGreedyFunctionApproximation2001}. The learning rate \eta~and the size of the ensemble $M$ are deeply intertwined and best tuned together \autocite[][13]{friedmanGreedyFunctionApproximation2001}. -Gradient boosting is still prone to \gls{overfitting} due to fitting trees to point-wise gradients. One solution is to employ early stopping, whereby the ensemble is only grown in size, as long as adding more weak learners leads to a decrease in loss on the validation set \autocite[][384]{hastietrevorElementsStatisticalLearning2009}. Another approach is to limit the amount of data seen during training by fitting trees on random subset of samples, as proposed in \textcite[][3]{friedmanStochasticGradientBoosting2002}, or on a subset of features, as popularised by \textcite[][3]{chenXGBoostScalableTree2016}. \textcite[][6]{prokhorenkovaCatBoostUnbiasedBoosting2018} grow oblivious trees, which use the same splitting criterion for all nodes of one level in a tree. The rationale is, that these arguably simplistic trees, and achieve an imperfect fit, which regularises the model. Finally, the loss function can be extended for a $\ell_2$ regularisation term to penalise the model for complexity \autocite[][2]{chenXGBoostScalableTree2016}. +Gradient boosting is still prone to \gls{overfitting} due to fitting trees to point-wise gradients. One solution is to employ early stopping, whereby the ensemble is only grown in size, as long as adding more weak learners leads to a decrease in loss on the validation set \autocite[][384]{hastietrevorElementsStatisticalLearning2009}. Another approach is to limit the amount of data seen during training by fitting trees on random subset of samples, as proposed in \textcite[][3]{friedmanStochasticGradientBoosting2002}, or on a subset of features, as popularized by \textcite[][3]{chenXGBoostScalableTree2016}. \textcite[][6]{prokhorenkovaCatBoostUnbiasedBoosting2018} grow oblivious trees, which use the same splitting criterion for all nodes of one level in a tree. The rationale is, that these arguably simplistic trees, and achieve an imperfect fit, which regularizes the model. Finally, the loss function can be extended for a $\ell_2$ regularization term to penalize the model for complexity \autocite[][2]{chenXGBoostScalableTree2016}. In recent years, several variants of gradient boosting have been proposed and studied in the literature, including CatBoost \autocite[][1--23]{prokhorenkovaCatBoostUnbiasedBoosting2018}, XGBoost \autocite[][1--13]{chenXGBoostScalableTree2016}, and LightGBM \autocite[][3]{keLightGBMHighlyEfficient2017}, which differ by the policy how trees are grown and how \gls{overfitting} is addressed. 
Performance-wise, differences between the implementations are negligible, as empirical studies suggest \autocites[][8]{grinsztajnWhyTreebasedModels2022}[][19--20]{gorishniyRevisitingDeepLearning2021}[][7]{somepalliSaintImprovedNeural2021}[][14]{borisovDeepNeuralNetworks2022}. @@ -168,19 +168,19 @@ \subsection{Transformer Networks}\label{sec:transformer-networks} \subsubsection{Architectural Overview}\label{sec:architectural-overview} -The Transformer is a neural network architecture by \textcite[][2--6]{vaswaniAttentionAllYou2017} proposed for sequence-to-sequence modelling. Its original application is in machine translation, whereby sentences in the source language are translated into sentences in the target language. More precisely, the sentence is first decomposed into individual \glspl{token} and mapped into a sequence of \glspl{embedding}, which are rich vector representations of the raw input. The Transformer then processes the \glspl{embedding} to generate the output sequence. +The Transformer is a neural network architecture by \textcite[][2--6]{vaswaniAttentionAllYou2017} proposed for sequence-to-sequence modeling. Its original application is in machine translation, whereby sentences in the source language are translated into sentences in the target language. More precisely, the sentence is first decomposed into individual \glspl{token} and mapped into a sequence of \glspl{embedding}, which are rich vector representations of the raw input. The Transformer then processes the \glspl{embedding} to generate the output sequence. As the network operates on \glspl{embedding}, rather than strings, the architecture is not constrained to process textual data. It has been adapted to other modalities including image data \autocites[][2--5]{parmarImageTransformer2018}[][3]{dosovitskiyImageWorth16x162021} and tabular data \autocite[cp.][4]{gorishniyRevisitingDeepLearning2021}. The latter is important for our work, as derived in \cref{sec:selection-of-approaches}. Following the architecture for machine translation of \textcite[][3]{sutskeverSequenceSequenceLearning2014}, the network features two main components: the encoder and the decoder. A sequence of \glspl{token} is first mapped to a sequence of \glspl{embedding} and augmented with positional information. The encoder receives these \glspl{embedding} and creates an enriched representation from it by encoding the context in which the input appears i.e., the surrounding words. The output of the encoder is then fed to the decoder. The decoder takes the embedded target sequence along with parts of the encoded representation of the input, to autoregressively generate the output sequence, i.e., the translation in the target language \gls{token} by \gls{token} \autocite[][3]{vaswaniAttentionAllYou2017}. \cref{fig:transformer-architecture-overview} depicts the complete architecture and serves as a guide through the subsequent sub-chapters. -The encoder consists of $\gls{L}=6$ stacked Transformer blocks \autocite[][6]{vaswaniAttentionAllYou2017}. Each block itself is composed of two sub-layers: a multi-head self-attention layer, followed by a position-wise, \gls{feed-forward-network}. Both components serve a distinct purpose in the Transformer. The self-attention mechanism encodes the context in which the input appears onto the \glspl{embedding}, whereas the \gls{feed-forward-network} serves as a long-term memory persisting information outside the immediate context. 
In the multi-head self-attention mechanism of the encoder, inputs can learn from any \gls{token} of the input sequence, even if a \gls{token} appears causally before the other input. Each of the sub-layers is surrounded by skip connections \autocite[][2]{heDeepResidualLearning2015} and followed by layer normalisation \autocite[][4]{baLayerNormalization2016} to facilitate and stabilise training. Stacking multiple Transformer blocks enables the model to learn hierarchical features from the inputs and targets. Applied to language processing, the first layers in the stack extract coarse-grained syntactic features, and subsequent layers learn fine-grained semantic features \autocites[][3651]{jawaharWhatDoesBERT2019}[][4596]{tenneyBERTRediscoversClassical2019}. For tabular data, this translates to frequent feature combinations or infrequent feature interactions. +The encoder consists of $\gls{L}=6$ stacked Transformer blocks \autocite[][6]{vaswaniAttentionAllYou2017}. Each block itself is composed of two sub-layers: a multi-head self-attention layer, followed by a position-wise, \gls{feed-forward-network}. Both components serve a distinct purpose in the Transformer. The self-attention mechanism encodes the context in which the input appears onto the \glspl{embedding}, whereas the \gls{feed-forward-network} serves as a long-term memory persisting information outside the immediate context. In the multi-head self-attention mechanism of the encoder, inputs can learn from any \gls{token} of the input sequence, even if a \gls{token} appears causally before the other input. Each of the sub-layers is surrounded by skip connections \autocite[][2]{heDeepResidualLearning2015} and followed by layer normalization \autocite[][4]{baLayerNormalization2016} to facilitate and stabilize training. Stacking multiple Transformer blocks enables the model to learn hierarchical features from the inputs and targets. Applied to language processing, the first layers in the stack extract coarse-grained syntactic features, and subsequent layers learn fine-grained semantic features \autocites[][3651]{jawaharWhatDoesBERT2019}[][4596]{tenneyBERTRediscoversClassical2019}. For tabular data, this translates to frequent feature combinations or infrequent feature interactions. Aside from the feed-forward sub-layer, the decoder contains a sub-layer for multi-head self-attention on the output of the encoder, known as cross-attention, and a masked variant of the multi-head self-attention for use on the output sequence. Here, causal masking enforces the autoregressive properties of the decoder. The output of the decoder is finally passed through a linear layer with a softmax activation function to unembed the output and retrieve the probabilities of the next \gls{token} \autocite[][5]{vaswaniAttentionAllYou2017}. Since the output sequence is generated autoregressively, the most probable \gls{token} is fed back as input to the decoder to provide context for the following \glspl{token} until the remaining sequence is generated. -For its original application, machine translation, both the encoder and decoder are used. Yet, the modular design allows adapting Transformers to a wider range of use cases, some of which only require the encoder or decoder. 
\textcite[][16--17]{raffelExploringLimitsTransfer2020} differentiate these modes: encoder-only architecture, which encodes the input to obtain an enriched representation, decoder-only architectures to generate new \glspl{token} and encoder-decoder models for sequence-to-sequence modelling autoregressively. As our focus is on the probabilistic classification of tabular data, the goal is to learn an enriched representation of the input for classifying the label, here $\gls{y}$, rather than generating new samples. As such, encoder-only Transformers suffice. This insight also guides the structure in the next chapters, which focus on \glspl{embedding} and the inner workings of the encoder. +For its original application, machine translation, both the encoder and decoder are used. Yet, the modular design allows adapting Transformers to a wider range of use cases, some of which only require the encoder or decoder. \textcite[][16--17]{raffelExploringLimitsTransfer2020} differentiate these modes: encoder-only architecture, which encodes the input to obtain an enriched representation, decoder-only architectures to generate new \glspl{token} and encoder-decoder models for sequence-to-sequence modeling autoregressively. As our focus is on the probabilistic classification of tabular data, the goal is to learn an enriched representation of the input for classifying the label, here $\gls{y}$, rather than generating new samples. As such, encoder-only Transformers suffice. This insight also guides the structure in the next chapters, which focus on \glspl{embedding} and the inner workings of the encoder. \begin{landscape} \begin{figure}[ht] @@ -197,7 +197,7 @@ \subsubsection{Token Embedding}\label{sec:token-embeddings} As explained previously, Transformers operate on sequences of numeric vector representations, the \emph{token embeddings}. The classical Transformer was trained on \emph{word embeddings}. Nevertheless, \gls{token} embeddings are generic and arbitrary inputs that can be embedded and then processed by the Transformer. In the spirit of \textcite[][5]{vaswaniAttentionAllYou2017}, we first explore word embeddings for textual data, before adapting embeddings to the tabular domain. -\todo{write down, how sequence of token ids is constructed.} +\todo{write down, how the sequence of token ids is constructed.} \textbf{Embeddings For Textual Data} @@ -207,12 +207,12 @@ \subsubsection{Token Embedding}\label{sec:token-embeddings} The conversion to token-ids, however, loses the semantics, as token-ids may be assigned arbitrarily or ordering by semantics may not be feasible. This limitation can be overcome by embeddings, as pioneered by \textcite[][1139]{bengioNeuralProbabilisticLanguage}, which map each token-id into a high-dimensional space. By representing words as a vector, semantic and syntactic relationships between tokens can be encoded. As such, related words share a similar embedding vector \autocite[][1139]{bengioNeuralProbabilisticLanguage}. Moreover, word embeddings are semantically meaningful and can capture linguistic regularities, like gender through offsets between vectors \autocite[][748--749]{mikolovLinguisticRegularitiesContinuous2013}. 
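To make the chain from tokens over token-ids to embedding vectors concrete, consider a small sketch (the toy vocabulary and dimensions are hypothetical; the randomly initialized matrix stands in for weights that would be learned):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

# hypothetical toy vocabulary mapping tokens to token-ids
vocab = {"the": 0, "trader": 1, "buys": 2, "sells": 3, "a": 4, "call": 5, "put": 6}
d_e, n_v = 4, len(vocab)

# embedding matrix with one column per token-id; random here,
# in practice learned jointly with the network by gradient descent
W_e = rng.normal(size=(d_e, n_v))

token_ids = [vocab[t] for t in ["the", "trader", "buys", "a", "call"]]
S = W_e[:, token_ids]   # column t holds the embedding of the t-th token
print(S.shape)          # (4, 5): embedding dimension x sequence length
\end{verbatim}
The lookup in the last step is exactly what the embedding layer formalized next performs.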
-The embedding layer from \cref{fig:transformer-architecture-overview} is ultimately a lookup table to retrieve the embedding vector $\gls{e} \in \mathbb{R}^{d_{e}}$ from a learnt embedding matrix $\gls{W-e} \in \mathbb{R}^{d_{e} \times N_{V}}$ with the token-id $v \in V \cong\left[N_{V}\right]$ as shown:\footnote{Throughout our discussion on Transformers we adopt a notation proposed in \textcite[][1--16]{phuongFormalAlgorithmsTransformers2022}.} +The embedding layer from \cref{fig:transformer-architecture-overview} is ultimately a lookup table to retrieve the embedding vector $\gls{e} \in \mathbb{R}^{d_{e}}$ from a learned embedding matrix $\gls{W-e} \in \mathbb{R}^{d_{e} \times N_{V}}$ with the token-id $v \in V \cong\left[N_{V}\right]$ as shown:\footnote{Throughout our discussion on Transformers we adopt a notation proposed in \textcite[][1--16]{phuongFormalAlgorithmsTransformers2022}.} \begin{equation} \gls{e}=\gls{W-e}\left[:, v\right]. \label{eq:word-embeddings} \end{equation} -The weights of $\gls{W-e}$ are initialised randomly and updated using gradient descent to obtain the learnt embeddings. The dimension of the embedding $d_e$ affects the expressiveness of the network and is thus an important tuneable hyperparameter of the model. All embeddings of the input sequence are finally gathered in a matrix $\mathbf{S} \in \mathbb{R}^{d_e \times \ell_s}$. +The weights of $\gls{W-e}$ are initialized randomly and updated using gradient descent to obtain the learned embeddings. The dimension of the embedding $d_e$ affects the expressiveness of the network and is thus an important tuneable hyperparameter of the model. All embeddings of the input sequence are finally gathered in a matrix $\mathbf{S} \in \mathbb{R}^{d_e \times \ell_s}$. Concluding the example from above with artificial embeddings of $d_e=3$: \begin{equation} @@ -227,36 +227,36 @@ \subsubsection{Token Embedding}\label{sec:token-embeddings} \textbf{Embeddings For Numerical Data} -Transformer networks can handle numerical features, such as the trade price, by mapping the scalar value to a high-dimensional embedding vector and process sequences thereof \autocite[][3]{gorishniyEmbeddingsNumericalFeatures2022}. In the simplest case, a learnt linear projection is utilised to obtain the embedding. Linear embeddings of numerical features were previously explored in \textcites[][3]{kossenSelfAttentionDatapointsGoing2021}[][4]{somepalliSaintImprovedNeural2021}[][4]{gorishniyRevisitingDeepLearning2021}. +Transformer networks can handle numerical features, such as the trade price, by mapping the scalar value to a high-dimensional embedding vector and process sequences thereof \autocite[][3]{gorishniyEmbeddingsNumericalFeatures2022}. In the simplest case, a learned linear projection is utilized to obtain the embedding. Linear embeddings of numerical features were previously explored in \textcites[][3]{kossenSelfAttentionDatapointsGoing2021}[][4]{somepalliSaintImprovedNeural2021}[][4]{gorishniyRevisitingDeepLearning2021}. -In analogon to the word case, if the $m$-th feature, $\mathbf{x}[m]$, is numerical, it is projected to its embedding $\gls{e} \in \mathbb{R}^{d_e}$ by element-wise multiplication with a learnt vector $\mathbf{W}_m \in \mathbb{R}^{d_{e}}$. Moreover, a feature-dependent bias term $\mathbf{b}_m \in \mathbb{R}^{d_{e}}$ is added, as noted in \cref{eq:numerical-embeddings}. 
+In analogy to the word case, if the $m$-th feature, $\mathbf{x}[m]$, is numerical, it is projected to its embedding $\gls{e} \in \mathbb{R}^{d_e}$ by element-wise multiplication with a learned vector $\mathbf{W}_m \in \mathbb{R}^{d_{e}}$. Moreover, a feature-dependent bias term $\mathbf{b}_m \in \mathbb{R}^{d_{e}}$ is added, as noted in \cref{eq:numerical-embeddings}. \begin{equation} \gls{e}= \mathbf{W}_m \mathbf{x}[m] +\mathbf{b}_m \label{eq:numerical-embeddings} \end{equation} More sophisticated approaches rely on parametric embeddings, like the \emph{piece-wise linear encoding} or the \emph{periodic encoding} of \textcite[][10]{gorishniyEmbeddingsNumericalFeatures2022}. Both enforce non-linearity. The authors show that these can improve the model's performance, but at a non-negligible computational cost. For this reason, our focus is on the computationally more tractable linear embedding. -More generally, the works of \textcites[][1]{gorishniyEmbeddingsNumericalFeatures2022}[][1]{somepalliSaintImprovedNeural2021} suggest, that numerical embedding can significantly improve robustness to missing values or noise. Their work miss a theoretical explanation. \textcite[][8--9]{grinsztajnWhyTreebasedModels2022} fill this void and attribute the increased robustness to the broken rotational invariance. +More generally, the works of \textcites[][1]{gorishniyEmbeddingsNumericalFeatures2022}[][1]{somepalliSaintImprovedNeural2021} suggest that numerical embeddings can significantly improve robustness to missing values or noise. Their works lack a theoretical explanation. \textcite[][8--9]{grinsztajnWhyTreebasedModels2022} fill this void and attribute the increased robustness to the broken rotational invariance. \textbf{Embeddings For Categorical Data} -Datasets often comprise categorical features like the underlying. In the context of tabular Transformers, learnt categorical embeddings are widely used, which are similar to the word embedding -\autocites[][4]{gorishniyRevisitingDeepLearning2021}[][2]{huangTabTransformerTabularData2020}[][4]{somepalliSaintImprovedNeural2021}. Analogous, each category is mapped to an embedding vector using a learnt embedding matrix. Due to the heterogeneous nature of tabular data, embeddings may not be shared between features. +Datasets often comprise categorical features like the underlying. In the context of tabular Transformers, learned categorical embeddings are widely used, which are similar to the word embedding +\autocites[][4]{gorishniyRevisitingDeepLearning2021}[][2]{huangTabTransformerTabularData2020}[][4]{somepalliSaintImprovedNeural2021}. Analogously, each category is mapped to an embedding vector using a learned embedding matrix. Due to the heterogeneous nature of tabular data, embeddings may not be shared between features. For categorical inputs, the embedding is implemented as a lookup table, analogous to \cref{eq:word-embeddings}. However, each feature has -its vocabulary $C_t$ with $N_{C_m}$ categories. Assume, the $m$-th feature is categorical. The specific embeddings $\gls{e}$ are queried with a unique integer key $c_{m} \in C_m \cong\left[N_{C_t}\right]$ from the learnt embedding matrix $\mathbf{W}_m \in \mathbb{R}^{d_e \times N_{C_m}}$. Finally, a feature-specific bias term $\mathbf{b}_m \in \mathbb{R}^{d_{e}}$ is added as shown in \cref{eq:categorical-embeddings}. Like for the word case, all embeddings of an instance are gathered in $\mathbf{S}$. +its vocabulary $C_t$ with $N_{C_m}$ categories. Assume the $m$-th feature is categorical.
The specific embeddings $\gls{e}$ are queried with a unique integer key $c_{m} \in C_m \cong\left[N_{C_t}\right]$ from the learned embedding matrix $\mathbf{W}_m \in \mathbb{R}^{d_e \times N_{C_m}}$. Finally, a feature-specific bias term $\mathbf{b}_m \in \mathbb{R}^{d_{e}}$ is added as shown in \cref{eq:categorical-embeddings}. Like for the word case, all embeddings of an instance are gathered in $\mathbf{S}$. \begin{equation} \gls{e}=\mathbf{W}_m[:,c_{m}] +\mathbf{b}_m \label{eq:categorical-embeddings} \end{equation} These categorical embeddings can potentially capture the intrinsic properties of categorical variables by arranging similar categories closer in the embedding space. For instance, consider the underlyings $\mathtt{GOOGL}$ (Alphabet Inc.), $\mathtt{MSFT}$ (Microsoft Inc.), and $\mathtt{K}$ (Kellogg Company). Due to the overlapping field of operations, one would anticipate greater similarity between Alphabet and Microsoft. -Despite these advantages, high-cardinal features present a challenge for embeddings since they are typically learnt from a few samples, which promotes \gls{overfitting}. Handling high-dimensional categorical data remains an open research problem, as noted by \textcite[][2]{borisovDeepNeuralNetworks2022}. +Despite these advantages, high-cardinal features present a challenge for embeddings since they are typically learned from a few samples, which promotes \gls{overfitting}. Handling high-dimensional categorical data remains an open research problem, as noted by \textcite[][2]{borisovDeepNeuralNetworks2022}. \textbf{Link To Positional Encoding and Attention} \todo{verify invariant property. Not sure if I got it right.} -Embeddings can only encode the semantic relationship of tokens, but they do not provide a clue to the model about the relative and absolute ordering of tokens in which they appear in the sequence, since all stages of the encoder and decoder are invariant to the token's position. Positional information must be induced into the model to preserve the ordering (cp. \cref{sec:positional-encoding}). Another limitation of embeddings is, that identical tokens share the embedding, even if they are ambiguous and their meaning is different from the context in which they appear. To resolve this issue, embeddings get contextualised in the self-attention mechanism (cp. \cref{sec:attention}). +Embeddings can only encode the semantic relationship of tokens, but they do not provide a clue to the model about the relative and absolute ordering of tokens in which they appear in the sequence, since all stages of the encoder and decoder are invariant to the token's position. Positional information must be induced into the model to preserve the ordering (cp. \cref{sec:positional-encoding}). Another limitation of embeddings is, that identical tokens share the embedding, even if they are ambiguous and their meaning is different from the context in which they appear. To resolve this issue, embeddings get contextualized in the self-attention mechanism (cp. \cref{sec:attention}). \subsubsection{Positional Encoding}\label{sec:positional-encoding} @@ -264,7 +264,7 @@ \subsubsection{Positional Encoding}\label{sec:positional-encoding} Contrary to sentences, columns in tabular datasets are arranged in an arbitrary order, which weakens the need for positional information. 
However, unless the embeddings per feature are unique, a positional embedding is also required so that the model can relate the otherwise identical embeddings to specific features and distinguish them \autocites[][3]{huangTabTransformerTabularData2020}[][15]{somepalliSaintImprovedNeural2021}. -Like \gls{token} embeddings, positional embeddings can also be learnt \autocite[cp.][4174]{devlinBERTPretrainingDeep2019}. Due to better, extrapolation capabilities, \textcite[][6]{vaswaniAttentionAllYou2017}, propose an positional encoding with the mapping $\gls{W-p}: \mathbb{N} \rightarrow \mathbb{R}^{d_{e}}$ based on sine and cosine signals to encode the \emph{absolute} position of the \gls{token}: +Like \gls{token} embeddings, positional embeddings can also be learned \autocite[cp.][4174]{devlinBERTPretrainingDeep2019}. Due to better extrapolation capabilities, \textcite[][6]{vaswaniAttentionAllYou2017} propose a positional encoding with the mapping $\gls{W-p}: \mathbb{N} \rightarrow \mathbb{R}^{d_{e}}$ based on sine and cosine signals to encode the \emph{absolute} position of the \gls{token}: \begin{equation} \begin{aligned} \gls{W-p}\left[2 i-1, t\right] & =\sin \left(t / \gls{ellmax}^{2 i / \gls{d}_e}\right), \\ @@ -277,15 +277,15 @@ \subsubsection{Positional Encoding}\label{sec:positional-encoding} \begin{figure}[ht] \centering \includegraphics{positional-encoding.pdf} - \caption[Positional Encoding of Transformer]{Positional encoding. The encoding is added onto the \gls{token} embeddings to infer positional information. The heatmap visualises the uniquely identifying pattern created from sine and cosine signals at increasing frequencies across the embedding dimension.} + \caption[Positional Encoding of Transformer]{Positional encoding. The encoding is added onto the \gls{token} embeddings to infer positional information. The heatmap visualizes the uniquely identifying pattern created from sine and cosine signals at increasing frequencies across the embedding dimension.} \label{fig:positional-embedding} \end{figure} -The positional encoding is visualised in \cref{fig:positional-embedding}. One can see... +The positional encoding is visualized in \cref{fig:positional-embedding}. One can see the alternating pattern between even and odd columns and the unique pattern for each \gls{token}'s position. -Using trigonometric functions ... favourable, due to being zero-centred ... +Using trigonometric functions for the positional embedding is favorable, due to being zero-centered and resulting in values in the closed range of $[-1,1]$. These properties are long known to promote convergence of neural networks \autocites[][8-9]{lecunEfficientBackProp2012}[][2]{ioffeBatchNormalizationAccelerating2015}. -The reason for encoding with both the sine and cosine is more subtle, as either one would suffice for absolute embeddings. \textcite[][6]{vaswaniAttentionAllYou2017} hypothesise, that besides learning the \emph{absolute} position i.e., fifth place in sequence, providing both sine and cosine also enables the model to attend to \emph{relative} positions, i.e., two places from a given \gls{token}.
+The reason for encoding with both the sine and cosine is more subtle, as either one would suffice for absolute embeddings. \textcite[][6]{vaswaniAttentionAllYou2017} hypothesize that, besides learning the \emph{absolute} position, i.e., the fifth place in the sequence, providing both sine and cosine also enables the model to attend to \emph{relative} positions, i.e., two places from a given \gls{token}. The positional embedding is finally added per element to the token embedding to form a \gls{token}'s initial embedding $\gls{e}$. For the $\gls{t}$-th \gls{token} of a sequence $\mathbf{s}$, the embedding becomes: \begin{equation} @@ -302,11 +302,11 @@ \subsubsection{Attention Mechanism}\label{sec:attention} Attention can be thought of as a mapping from a query and a set of key-value pairs to an output. In general, the current token is first projected onto a query vector, and all tokens in the context are mapped to key and value vectors. Similar to a soft dictionary lookup, the goal is to retrieve the values from tokens in the context for which the keys are similar to the query and return an aggregate estimate of the values weighted by the similarity of the keys and the query. Naturally, if a token in the context is important for predicting the queried token, indicated by a high similarity, the value of the context token has a large contribution to the output \autocites[][5]{phuongFormalAlgorithmsTransformers2022}[][3]{vaswaniAttentionAllYou2017}. -Attention first appeared in \textcite[][4]{bahdanauNeuralMachineTranslation2016} and was popularised by \textcite[][4]{vaswaniAttentionAllYou2017}. The latter introduced a specific attention mechanism, known as \emph{scaled dot-product attention}, which we introduce in detail. +Attention first appeared in \textcite[][4]{bahdanauNeuralMachineTranslation2016} and was popularized by \textcite[][4]{vaswaniAttentionAllYou2017}. The latter introduced a specific attention mechanism, known as \emph{scaled dot-product attention}, which we introduce in detail. \textbf{Scaled Dot-Product Attention} -Analogous to before, \emph{scaled dot-product attention} estimates the similarity between queries and keys, as the dot product. The resulting attention scores are divided by some constant and normalised using a softmax function to obtain the attention weights. Multiplication of the attention weights with the values yields the outputs. Scaled dot-product attention is visualised in \cref{fig:transformer-architecture-overview} (left). +Analogous to before, \emph{scaled dot-product attention} estimates the similarity between queries and keys as their dot product. The resulting attention scores are divided by a scaling constant and normalized using a softmax function to obtain the attention weights. Multiplication of the attention weights with the values yields the outputs. Scaled dot-product attention is visualized in \cref{fig:transformer-architecture-overview} (left). For computational efficiency, attention is performed simultaneously over multiple queries. Thus, the authors group queries, keys, and values in matrices. In matrix notation, the outputs are estimated as: \begin{equation} @@ -317,11 +317,11 @@ \subsubsection{Attention Mechanism}\label{sec:attention} \end{aligned} \label{eq:attention} \end{equation} -where $\mathbf{S} \in \mathbb{R}^{d_s \times \ell_s}$ and $\mathbf{Z} \in \mathbb{R}^{d_z \times \ell_z}$ are vector representations of the primary input sequence and of the context sequence.
Both the primary and the context sequences are identical for the encoder but are different for the decoder. The query, key, and value matrices $\mathbf{Q}=\mathbf{W}_q \mathbf{S} + \mathbf{b}_q\mathbf{1}^{\top}$, $\mathbf{K}=\mathbf{W}_k \mathbf{Z} + \mathbf{b}_k\mathbf{1}^{\top}$, and $\mathbf{V}=\mathbf{W}_v \mathbf{Z} + \mathbf{b}_v\mathbf{1}^{\top}$ are linear projections of the input and context sequences, and $\mathbf{W}_q, \mathbf{W}_k \in \mathbb{R}^{d_{\mathrm{attn}\times d_{s}}}$; $\mathbf{W}_v \in \mathbb{R}^{d_{\mathrm{out}\times d_{z}}}$; $\mathbf{b}_q, \mathbf{b}_k \in \mathbb{R}^{d_{\mathrm{attn}}}$, and $\mathbf{b}_v \in \mathbb{R}^{d_{\mathrm{out}}}$ are learnable parameters. The dimensionality of the attention mechanism, $d_{\mathrm{attn}}$, is typically a fraction of the model dimensionality to accelerate computation. Likewise, the output dimension, $d_{out}$, is another hyperparameter to the models. The attention scores are $\mathbf{A}$, which are scaled by $\sqrt{d_{\mathrm{attn}}}$ to avoid unstable gradients, and the softmax activation normalises all scores. As normalised attention scores have a clear interpretation as the weights of how much a token contributes to the model's output, the attention mechanism provides a window into the model, which we explore in \cref{sec:feature-importance-measure}. +where $\mathbf{S} \in \mathbb{R}^{d_s \times \ell_s}$ and $\mathbf{Z} \in \mathbb{R}^{d_z \times \ell_z}$ are vector representations of the primary input sequence and of the context sequence. Both the primary and the context sequences are identical for the encoder but are different for the decoder. The query, key, and value matrices $\mathbf{Q}=\mathbf{W}_q \mathbf{S} + \mathbf{b}_q\mathbf{1}^{\top}$, $\mathbf{K}=\mathbf{W}_k \mathbf{Z} + \mathbf{b}_k\mathbf{1}^{\top}$, and $\mathbf{V}=\mathbf{W}_v \mathbf{Z} + \mathbf{b}_v\mathbf{1}^{\top}$ are linear projections of the input and context sequences, and $\mathbf{W}_q, \mathbf{W}_k \in \mathbb{R}^{d_{\mathrm{attn}\times d_{s}}}$; $\mathbf{W}_v \in \mathbb{R}^{d_{\mathrm{out}\times d_{z}}}$; $\mathbf{b}_q, \mathbf{b}_k \in \mathbb{R}^{d_{\mathrm{attn}}}$, and $\mathbf{b}_v \in \mathbb{R}^{d_{\mathrm{out}}}$ are learnable parameters. The dimensionality of the attention mechanism, $d_{\mathrm{attn}}$, is typically a fraction of the model dimensionality to accelerate computation. Likewise, the output dimension, $d_{out}$, is another hyperparameter to the models. The attention scores are $\mathbf{A}$, which are scaled by $\sqrt{d_{\mathrm{attn}}}$ to avoid unstable gradients, and the softmax activation normalizes all scores. As normalized attention scores have a clear interpretation as the weights of how much a token contributes to the model's output, the attention mechanism provides a window into the model, which we explore in \cref{sec:feature-importance-measure}. \textbf{Multi-Head Attention} -Rather than relying on a single attention function, \textcite[][4--5]{vaswaniAttentionAllYou2017} introduce multiple \emph{attention heads}, which perform attention in parallel on $H$ \emph{different} linear projections of queries, keys, and values. The \emph{multi-head attention} enables the model to learn richer representations of the input, as attention heads operate independently, they can pick up unique patterns or focus on different positions in the sequence at once. Multi-head attention is visualised in \cref{fig:transformer-architecture-overview} (centre). 
+Rather than relying on a single attention function, \textcite[][4--5]{vaswaniAttentionAllYou2017} introduce multiple \emph{attention heads}, which perform attention in parallel on $H$ \emph{different} linear projections of queries, keys, and values. The \emph{multi-head attention} enables the model to learn richer representations of the input, as attention heads operate independently, they can pick up unique patterns or focus on different positions in the sequence at once. Multi-head attention is visualized in \cref{fig:transformer-architecture-overview} (center). \todo{introduce word modalities} Exemplary for machine translation, \textcite[][5795]{voitaAnalyzingMultiHeadSelfAttention2019} show, that heads serve indeed distinct purposes like learning positional or syntactic relations between tokens. It is conceivable, that for tabular data this maps to dependencies between features. In practice, Transformers may not leverage all attention heads and some heads could even be pruned without impacting the performance \autocites[][9]{michelAreSixteenHeads2019}[][5805]{voitaAnalyzingMultiHeadSelfAttention2019}. @@ -356,25 +356,25 @@ \subsubsection{Position-Wise Feed-Forward Networks}\label{sec:position-wise-ffn} \textcite[][9]{vaswaniAttentionAllYou2017} set the hidden dimension to be two to eight magnitudes of the embedding dimension. The large capacity strengthens the model's ability to retain information but also contributes significantly to the high computational requirements and memory footprint of Transformers \autocites[][5]{tayEfficientTransformersSurvey2022}[][1]{kitaevReformerEfficientTransformer2020}. Both linear transformations are separated by a \gls{ReLU} \gls{activation-function} \autocite[][318]{glorotDeepSparseRectifier2011} to introduce non-linearities to the network. -Like the attention layer, the position-wise \gls{FFN} is surrounded by residual connections, followed by layer normalisation (cp. \cref{sec:residual-connections-layer-norm}). Both are vital for the training process and convergence of the overall network. Optionally, dropout is added to prevent the model from \gls{overfitting}. +Like the attention layer, the position-wise \gls{FFN} is surrounded by residual connections, followed by layer normalization (cp. \cref{sec:residual-connections-layer-norm}). Both are vital for the training process and convergence of the overall network. Optionally, dropout is added to prevent the model from \gls{overfitting}. -\subsubsection{Residual Connections and Layer Normalisation}\label{sec:residual-connections-layer-norm} +\subsubsection{Residual Connections and Layer Normalization}\label{sec:residual-connections-layer-norm} -Recall from earlier chapters, that the encoder stacks multiple Transformer blocks, each of which consists of several sub-layers, resulting in a deep network. While depth is inevitable to learn hierarchical representations, the training of such a network is complicated. As neural networks are commonly trained using backpropagation, which relies on the gradient of the error to be propagated through the network starting at the last layer, vanishing or \glspl{exploding-gradient} pose a major difficulty in training deep neural nets \autocite[][1]{heDeepResidualLearning2015}. Without countermeasures, stacking multiple layers in the encoder and decoder of the Transformers impedes the gradient information to flow efficiently through the network and hampers the training behaviour \autocite[][1811]{wangLearningDeepTransformer2019}. 
+Recall from earlier chapters, that the encoder stacks multiple Transformer blocks, each of which consists of several sub-layers, resulting in a deep network. While depth is inevitable to learn hierarchical representations, the training of such a network is complicated. As neural networks are commonly trained using backpropagation, which relies on the gradient of the error to be propagated through the network starting at the last layer, vanishing or \glspl{exploding-gradient} pose a major difficulty in training deep neural nets \autocite[][1]{heDeepResidualLearning2015}. Without countermeasures, stacking multiple layers in the encoder and decoder of the Transformers impedes the gradient information to flow efficiently through the network and hampers the training behavior \autocite[][1811]{wangLearningDeepTransformer2019}. -As a remedy, \textcite[][3]{vaswaniAttentionAllYou2017} employ residual connections around each sub-layer, whereby the output of the sub-layer is added element-wisely to its input. Intuitively, the residual connection provides an alternative path for information to flow through the network, since some information can bypass the sub-layer and thereby reach deeper layers within the stack. Vanishing or \glspl{exploding-gradient} are also mitigated, as gradients can bypass the sub-layer, eventually contributing towards an easier optimisation \autocite[][3591]{liuRethinkingSkipConnection2020}. Residual connections moreover help to preserve the positional embeddings (cp. \cref{sec:positional-encoding}), as the layer's inputs are maintained in the identity mapping. Another technique to improve the training behaviour is layer normalisation. +As a remedy, \textcite[][3]{vaswaniAttentionAllYou2017} employ residual connections around each sub-layer, whereby the output of the sub-layer is added element-wisely to its input. Intuitively, the residual connection provides an alternative path for information to flow through the network, since some information can bypass the sub-layer and thereby reach deeper layers within the stack. Vanishing or \glspl{exploding-gradient} are also mitigated, as gradients can bypass the sub-layer, eventually contributing towards an easier optimization \autocite[][3591]{liuRethinkingSkipConnection2020}. Residual connections moreover help to preserve the positional embeddings (cp. \cref{sec:positional-encoding}), as the layer's inputs are maintained in the identity mapping. Another technique to improve the training behavior is layer normalization. -\textcite[][3]{vaswaniAttentionAllYou2017} extensively draw on layer normalisation \autocite[][4]{baLayerNormalization2016} after the multi-head attention and feed-forward sub-layers. It is used for normalising the activations of the sub-layer and to stabilise and accelerate the training of the network \autocite[][2]{baLayerNormalization2016}. The normalisation statistics are calculated separately for every instance, which guarantees scalability across different batch sizes. +\textcite[][3]{vaswaniAttentionAllYou2017} extensively draw on layer normalization \autocite[][4]{baLayerNormalization2016} after the multi-head attention and feed-forward sub-layers. It is used for normalizing the activations of the sub-layer and to stabilize and accelerate the training of the network \autocite[][2]{baLayerNormalization2016}. The normalization statistics are calculated separately for every instance, which guarantees scalability across different batch sizes. 
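Bringing the scaled dot-product attention from \cref{sec:attention} together with the residual connection and layer normalization, a single post-norm encoder sub-layer can be sketched as follows (a minimal NumPy illustration with tokens as columns; biases, multiple heads, dropout, and the learnable gain and shift of layer normalization are omitted):
\begin{verbatim}
import numpy as np

def softmax(a, axis=0):
    a = a - a.max(axis=axis, keepdims=True)
    return np.exp(a) / np.exp(a).sum(axis=axis, keepdims=True)

def layer_norm(x, eps=1e-5):
    # statistics are computed per token (column) over the embedding dimension
    mu = x.mean(axis=0, keepdims=True)
    sigma = x.std(axis=0, keepdims=True)
    return (x - mu) / (sigma + eps)

def self_attention(S, W_q, W_k, W_v):
    # S has shape (d_e, sequence length); W_v maps back to d_e
    Q, K, V = W_q @ S, W_k @ S, W_v @ S
    d_attn = Q.shape[0]
    A = softmax(K.T @ Q / np.sqrt(d_attn), axis=0)  # attention weights per query
    return V @ A

def post_norm_sublayer(S, W_q, W_k, W_v):
    # residual connection around the sub-layer, followed by layer normalization
    return layer_norm(S + self_attention(S, W_q, W_k, W_v))
\end{verbatim}
Whether the normalization sits inside or outside the residual branch is precisely what separates the pre-norm from the post-norm arrangement discussed next.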
-Until now it remains unclear, how the layer normalisation intertwines with the sub-layers and the residual connections. Transformers are distinguished by the order in which layer normalisation is added into the pre-norm and post-norm Transformer. Post-norm Transformers add layer normalisation to the sub-layer \emph{after} adding the input from the residual connections. The arrangement is depicted in \cref{fig:transformer-architecture-overview}. In contrast for pre-norm Transformers, the normalisation is applied \emph{before} the self-attention and feed-forward sub-layers and inside the residual connections. Pre-norm requires one additional normalisation layer to pass only well-conditioned outputs from the Transformer block to the successive layers \autocite[][5]{xiongLayerNormalizationTransformer2020}. +Until now it remains unclear, how the layer normalization intertwines with the sub-layers and the residual connections. Transformers are distinguished by the order in which layer normalization is added into the pre-norm and post-norm Transformer. Post-norm Transformers add layer normalization to the sub-layer \emph{after} adding the input from the residual connections. The arrangement is depicted in \cref{fig:transformer-architecture-overview}. In contrast for pre-norm Transformers, the normalization is applied \emph{before} the self-attention and feed-forward sub-layers and inside the residual connections. Pre-norm requires one additional normalization layer to pass only well-conditioned outputs from the Transformer block to the successive layers \autocite[][5]{xiongLayerNormalizationTransformer2020}. -\textcite[][3]{vaswaniAttentionAllYou2017} employ post-layer normalisation, but recent research has shown a shift towards pre-norm setups \autocite[][4]{narangTransformerModificationsTransfer2021}. Parts of the widespread adaption lie in faster training and omitting of the need for costly learning rate warm-up stages, whereby the learning rate is initially decreased to keep the gradients balanced \autocites[][2]{xiongLayerNormalizationTransformer2020}[][8]{liuUnderstandingDifficultyTraining2020}. In addition, post-norm Transformers have been found brittle to train and prone to convergence failures with its root cause in vanishing gradients, \glspl{exploding-gradient}, and an overall higher dependency on the residual stream \autocites[][8]{liuUnderstandingDifficultyTraining2020}[][1812]{wangLearningDeepTransformer2019}. Pre-norm Transformers, although they may sacrifice some performance, introduce a certain robustness to the training process. We come back to this property in our discussion on the FT-Transformer. +\textcite[][3]{vaswaniAttentionAllYou2017} employ post-layer normalization, but recent research has shown a shift towards pre-norm setups \autocite[][4]{narangTransformerModificationsTransfer2021}. Parts of the widespread adaption lie in faster training and omitting of the need for costly learning rate warm-up stages, whereby the learning rate is initially decreased to keep the gradients balanced \autocites[][2]{xiongLayerNormalizationTransformer2020}[][8]{liuUnderstandingDifficultyTraining2020}. In addition, post-norm Transformers have been found brittle to train and prone to convergence failures with its root cause in vanishing gradients, \glspl{exploding-gradient}, and an overall higher dependency on the residual stream \autocites[][8]{liuUnderstandingDifficultyTraining2020}[][1812]{wangLearningDeepTransformer2019}. 
Pre-norm Transformers, although they may sacrifice some performance, introduce a certain robustness to the training process. We come back to this property in our discussion on the FT-Transformer. \subsubsection{FT-Transformer}\label{sec:fttransformer} \todo{try to introduce BERT here somewhere.} -Many of the previous concepts can be adapted to the tabular domain with minor architectural changes. \textcite[][5]{gorishniyRevisitingDeepLearning2021} propose with FT-Transformer an adaption, that pairs an embedding unit for both numerical and categorical inputs, dubbed the feature tokenizer, with a Transformer. The complete architecture is depicted in \cref{fig:fttransformer}. Notably, the Transformer units use a pre-norm setup for easier optimisation, whereby the very first normalisation layer in the encoder is removed due to a propitious performance \textcite[][17]{gorishniyRevisitingDeepLearning2021}. The upstream feature tokenizer transforms every feature in $\mathbf{x}$ to their embeddings. The embeddings are given by \cref{eq:numerical-embeddings,eq:categorical-embeddings}. +Many of the previous concepts can be adapted to the tabular domain with minor architectural changes. \textcite[][5]{gorishniyRevisitingDeepLearning2021} propose the FT-Transformer, an adaptation that pairs an embedding unit for both numerical and categorical inputs, dubbed the feature tokenizer, with a Transformer. The complete architecture is depicted in \cref{fig:fttransformer}. Notably, the Transformer units use a pre-norm setup for easier optimization, whereby the very first normalization layer in the encoder is removed, which \textcite[][17]{gorishniyRevisitingDeepLearning2021} find to improve performance. The upstream feature tokenizer transforms every feature in $\mathbf{x}$ into its embedding. The embeddings are given by \cref{eq:numerical-embeddings,eq:categorical-embeddings}. \begin{figure}[ht] \centering @@ -385,15 +385,15 @@ \subsubsection{FT-Transformer}\label{sec:fttransformer} \label{fig:fttransformer} \end{figure} -Recall from our discussion on self-attention (cp. \cref{sec:attention}), that each \gls{token} encodes the \glspl{token} within the sequence. Based on this notion, \textcite[][4174]{devlinBERTPretrainingDeep2019} prepend a specialised $\mathtt{[CLS]}$ \gls{token} to the sequence, which stores the sequence's aggregate representation. Like any other \gls{token}, the $\mathtt{[CLS]}$ \gls{token} is embedded first and contextualised in the encoder. Its final hidden state is then used for classification. +Recall from our discussion on self-attention (cp. \cref{sec:attention}), that each \gls{token} encodes the \glspl{token} within the sequence. Based on this notion, \textcite[][4174]{devlinBERTPretrainingDeep2019} prepend a specialized $\mathtt{[CLS]}$ \gls{token} to the sequence, which stores the sequence's aggregate representation. Like any other \gls{token}, the $\mathtt{[CLS]}$ \gls{token} is embedded first and contextualized in the encoder. Its final hidden state is then used for classification. \textcite[][4]{gorishniyRevisitingDeepLearning2021} adapt the idea of a $\mathtt{[CLS]}$ \gls{token} for tabular representation models. Similar to the embeddings of categorical or numerical features, the embedding of the $[\mathtt{CLS}]$ \gls{token} $\gls{e}_\mathtt{[CLS]} \in \mathbb{R}^{d_{e}}$ is prepended to the column embeddings with $\mathbf{S} = \left[\gls{e}_\mathtt{[CLS]}, \gls{e}_1, \ldots \gls{e}_{M}\right]$, where $\mathbf{S} \in \mathbb{R}^{d_{e} \times (M +1)}$.
Like before, $\mathbf{S}$ is passed through a stack of Transformer layers. The updated representation of the $\mathtt{[CLS]}$ \gls{token} is used exclusively for prediction: \begin{equation} P=\operatorname{Linear}\left(\operatorname{ReLU}\left(\operatorname{LayerNorm}\left(\mathbf{S}\left[:,0\right]\right)\right)\right). \label{eq:bert-ft} \end{equation} -\todo{Add softmax, think about ReLU, change linear layer to Weight matrix?} +\todo{Add softmax, think about ReLU, change the linear layer to Weight matrix?} -\textcite[][8]{gorishniyRevisitingDeepLearning2021} achieve state-of-the-art performance through numerical and categorical embeddings. Embedding both categorical and numerical inputs enables the Transformer to attend to all other features, but at considerable computational cost, that may only be justified by higher classification accuracies. +\textcite[][8]{gorishniyRevisitingDeepLearning2021} achieve state-of-the-art performance through numerical and categorical embeddings. Embedding both categorical and numerical inputs enables the Transformer to attend to all other features, but at a considerable computational cost, that may only be justified by higher classification accuracies. -Next, all models are extended for learning on partially-labelled data. \ No newline at end of file +Next, all models are extended for learning on partially-labeled data. \ No newline at end of file diff --git a/reports/Content/training-tuning.tex b/reports/Content/training-tuning.tex index e6f5f900..14992353 100644 --- a/reports/Content/training-tuning.tex +++ b/reports/Content/training-tuning.tex @@ -16,7 +16,7 @@ \subsubsection{Training of Supervised \cref{fig:gbm-train-val-loss-acc} displays the loss and accuracies of the default implementation on the \gls{ISE} training and validation set using classical features. The plots reveal several insights. -First, the model overfits the training data, as evident from the generalisation gap between training and validation accuracies. To improve generalisation performance, we apply regularisation techniques. +First, the model overfits the training data, as evident from the generalization gap between training and validation accuracies. To improve generalization performance, we apply regularization techniques. Second, validation loss spikes for larger ensembles, while validation accuracy continues to improve. This discrepancy suggests that the predicted class's correctness improves, but the ensemble becomes less confident in the correctness of the prediction. @@ -27,16 +27,16 @@ \subsubsection{Training of Supervised \label{fig:gbm-loss-distribution} \end{figure} -This behaviour can be explained by the log loss being unbound, where single incorrect predictions can cause the loss to explode. We verify this assumption by plotting the distribution of the sample-wise log loss in \cref{fig:gbm-loss-distribution}. As visible, loss per sample decreases for larger ensembles, at the same time few predictions contribute to the loss unproportionally, causing the average validation loss to stagnate. +This behavior can be explained by the log loss being unbound, where single incorrect predictions can cause the loss to explode. We verify this assumption by plotting the distribution of the sample-wise log loss in \cref{fig:gbm-loss-distribution}. As visible, loss per sample decreases for larger ensembles, at the same time few predictions contribute to the loss unproportionally, causing the average validation loss to stagnate. 
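A small numerical example with hypothetical predicted probabilities illustrates how a single confident but wrong prediction can dominate the average log loss while leaving accuracy nearly untouched:
\begin{verbatim}
import numpy as np

# probability assigned to the true class for 100 hypothetical trades
p_true = np.array([0.9] * 99 + [1e-4])  # 99 decent predictions, 1 confident error
loss = -np.log(p_true)                  # sample-wise log loss

print(round(loss[:99].mean(), 3))       # 0.105: mean loss without the outlier
print(round(loss.mean(), 3))            # 0.196: one sample nearly doubles the mean
\end{verbatim}
Accuracy in this toy example would still be \SI{99}{\percent}, mirroring the divergence between validation accuracy and validation loss described above.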
\begin{figure}[ht] \centering \includegraphics{gbm-optimisations-loss-acc.pdf} - \caption[Training and Validation Accuracy of \glsentryshort{GBRT} with Optimisations]{Training and validation accuracy of \gls{GBRT} on \gls{ISE} sample with optimisations. Metrics are estimated on the classical feature set. One iteration corresponds to an additional regression tree added to the ensemble. Loss is expected to decrease for more complex ensembles and accuracy to increase.} - \label{fig:gbm-optimisations-loss-acc} + \caption[Training and Validation Accuracy of \glsentryshort{GBRT} with Optimisations]{Training and validation accuracy of \gls{GBRT} on \gls{ISE} sample with optimizations. Metrics are estimated on the classical feature set. One iteration corresponds to an additional regression tree added to the ensemble. Loss is expected to decrease for more complex ensembles and accuracy to increase.} + \label{fig:gbm-optimizations-loss-acc} \end{figure} -We leverage several architectural changes to reduce the loss, further improve performance and mitigate overfitting in gradient boosting, as shown in \cref{fig:gbm-optimisations-loss-acc}, where the effects on validation accuracy and log loss over the default configuration are visualised. Following standard practice, e.g., \textcite{tuningplaybookgithub}, all other parameters are kept at their default values, while a single parameter is varied to derive the plots. Although this approach ignores parameter interactions, it still can guide the optimal training configuration. We train on the ISE training set with classical features and report metrics on the validation set. +We leverage several architectural changes to reduce the loss, further improve performance and mitigate overfitting in gradient boosting, as shown in \cref{fig:gbm-optimizations-loss-acc}, where the effects on validation accuracy and log loss over the default configuration are visualized. Following standard practice, e.g., \textcite[][]{tuningplaybookgithub}, all other parameters are kept at their default values, while a single parameter is varied to derive the plots. Although this approach ignores parameter interactions, it still can guide the optimal training configuration. We train on the \gls{ISE} training set with classical features and report metrics on the validation set. \emph{Growth Strategy} @@ -44,7 +44,7 @@ \subsubsection{Training of Supervised \emph{Sample Weighting} -The work of \textcite[][36--38]{grauerOptionTradeClassification2022} suggests a strong temporal shift in the data, with the performance of classical trade classification rules deteriorating over time. As a result, the predictability of features derived from these rules diminishes over time, and patterns learnt from old observations become less relevant for predicting test samples. To address this, we introduce a sample weighting scheme that assigns higher weights to recent training samples and gradually decays weights over time, which we incorporate into the log loss. Validation and test samples are equally weighted. Sample weighting proves to be essential for achieving high validation performance, and it positively impacts the accuracy and confidence in the prediction mitigating the problem from above. +The work of \textcite[][36--38]{grauerOptionTradeClassification2022} suggests a strong temporal shift in the data, with the performance of classical trade classification rules deteriorating over time. 
As a result, the predictability of features derived from these rules diminishes over time, and patterns learned from old observations become less relevant for predicting test samples. To address this, we introduce a sample weighting scheme that assigns higher weights to recent training samples and gradually decays weights over time, which we incorporate into the log loss. Validation and test samples are equally weighted. Sample weighting proves to be essential for achieving high validation performance, and it positively impacts the accuracy and confidence in the prediction mitigating the problem from above. \emph{Border Count} @@ -58,36 +58,36 @@ \subsubsection{Training of Supervised \textbf{FT-Transformer} -We rely on the FT-Transformer of \textcite[][4--5]{gorishniyRevisitingDeepLearning2021} as our second model. The training of Transformers has been found non-trivial and requires a carefully designed training setup of model, optimizer, and learning rate schedule \autocite[][1]{liuUnderstandingDifficultyTraining2020}. We investigate minor modifications to the default FT-Transformer to stabilise training and improve overall performance. The default FT-Transformer is trained for 10 epochs on \gls{ISE} dataset with classical features and loss and accuracy are visualised in \cref{fig:fttransformer-optimisations-loss-acc}.\footnote{Default configuration documented in \textcite[][18]{gorishniyRevisitingDeepLearning2021}.} +We rely on the FT-Transformer of \textcite[][4--5]{gorishniyRevisitingDeepLearning2021} as our second model. The training of Transformers has been found non-trivial and requires a carefully designed training setup of model, optimizer, and learning rate schedule \autocite[][1]{liuUnderstandingDifficultyTraining2020}. We investigate minor modifications to the default FT-Transformer to stabilize training and improve overall performance. The default FT-Transformer is trained for 10 epochs on \gls{ISE} dataset with classical features and loss and accuracy are visualized in \cref{fig:fttransformer-optimizations-loss-acc}.\footnote{Default configuration documented in \textcite[][18]{gorishniyRevisitingDeepLearning2021}.} -The convergence behaviour of our model is similar to that of gradient boosting. Equally, a significant generalisation gap exists between the training and validation loss. Particularly concerning, the training loss decreases sharply, while the validation loss spuriously improves over its initial estimate. Despite this, validation accuracy improves throughout the entire training cycle. We reason that the network learns to correctly classify trades, indicated by the improved accuracy, but only attains low-confident correct predictions or confident but erroneous predictions which both contribute to a large validation loss. +The convergence behavior of our model is similar to that of gradient boosting. Equally, a significant generalization gap exists between the training and validation loss. Particularly concerning, the training loss decreases sharply, while the validation loss spuriously improves over its initial estimate. Despite this, validation accuracy improves throughout the entire training cycle. We reason that the network learns to correctly classify trades, indicated by the improved accuracy, but only attains low-confident correct predictions or confident but erroneous predictions which both contribute to a large validation loss. 
\begin{figure}[!ht] \centering \includegraphics{fttransformer-optimisations-loss-acc.pdf} - \caption[Training and Validation Accuracy of FT-Transformer with Optimisations]{Training and validation accuracy of FT-Transformer on \gls{ISE} sample with optimisations. Metrics are estimated on the classical feature set. One iteration corresponds to one gradient update. The end of each epoch is marked with a dashed bar. Loss is expected to decrease throughout training and accuracy to increase.} - \label{fig:fttransformer-optimisations-loss-acc} + \caption[Training and Validation Accuracy of FT-Transformer with Optimisations]{Training and validation accuracy of FT-Transformer on \gls{ISE} sample with optimizations. Metrics are estimated on the classical feature set. One iteration corresponds to one gradient update. The end of each epoch is marked with a dashed bar. Loss is expected to decrease throughout training and accuracy to increase.} + \label{fig:fttransformer-optimizations-loss-acc} \end{figure} \textbf{Solutions For FT-Transformer} \emph{Activation Function} -Motivated by previous research, we experiment with replacing the $\operatorname{ReLU}$ activation with the $\operatorname{GELU}$ activation function \autocite[][2]{hendrycksGaussianErrorLinear2020} in the classification head and the gated variant $\operatorname{ReGLU}$ with the gated variant $\operatorname{GEGLU}$ \autocite[][2]{shazeerGLUVariantsImprove2020} in the \gls{FFN}. As visualised in \cref{fig:fttransformer-optimisations-loss-acc}, no advantage in terms of validation accuracy or loss is evident. +Motivated by previous research, we experiment with replacing the $\operatorname{ReLU}$ activation with the $\operatorname{GELU}$ activation function \autocite[][2]{hendrycksGaussianErrorLinear2020} in the classification head and the gated variant $\operatorname{ReGLU}$ with the gated variant $\operatorname{GEGLU}$ \autocite[][2]{shazeerGLUVariantsImprove2020} in the \gls{FFN}. As visualized in \cref{fig:fttransformer-optimizations-loss-acc}, no advantage in terms of validation accuracy or loss is evident. \emph{Sample Weighting} -We apply the concept of sample weighting from \gls{GBRT} to Transformers. Specifically, we scale the contribution of individual training samples to the loss using a sample weight, which penalises the model for misclassifying recent observations. This method is crucial for achieving low validation loss and high validation accuracies, as visible in \cref{fig:fttransformer-optimisations-loss-acc}. The significantly lower training accuracy implies, that patterns from latter observations do not universally transfer to previous observations. At this time, it remains unclear what is causing the data drift within the training set. +We apply the concept of sample weighting from \gls{GBRT} to Transformers. Specifically, we scale the contribution of individual training samples to the loss using a sample weight, which penalizes the model for misclassifying recent observations. This method is crucial for achieving low validation loss and high validation accuracies, as visible in \cref{fig:fttransformer-optimizations-loss-acc}. The significantly lower training accuracy implies, that patterns from latter observations do not universally transfer to previous observations. At this time, it remains unclear what is causing the data drift within the training set. 
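To make the sample weighting used for both \gls{GBRT} and the FT-Transformer concrete, the following sketch shows one possible implementation; the exponential decay profile and its rate are assumptions for illustration, not the tuned scheme:
\begin{verbatim}
import numpy as np

def recency_weights(timestamps, decay=0.999):
    # timestamps: array of numpy datetime64 values;
    # the newest trade gets weight 1, older trades get exponentially smaller weights
    age_days = (timestamps.max() - timestamps) / np.timedelta64(1, "D")
    return decay ** age_days

def weighted_log_loss(y, p, w):
    # y in {-1, 1}, p = predicted P(y = 1); w scales each sample's contribution
    p_true = np.where(y == 1, p, 1.0 - p)
    return np.average(-np.log(np.clip(p_true, 1e-12, 1.0)), weights=w)
\end{verbatim}
Validation and test samples keep equal weights, as stated above, so reported metrics remain comparable.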
\clearpage
\emph{Label Smoothing}
-A major problem in classification with neural networks is, that the network becomes over-confident in predicting training samples but performs poorly on unseen data. In \cref{fig:fttransformer-optimisations-loss-acc} the effect is evident, as the increased confidence in the prediction on the training set does not transfer to the validation set. To regularise the network, we experiment with label smoothing \autocite[][2823]{szegedyRethinkingInceptionArchitecture2016} by training on soft labels with an uncertainty constant of $\epsilon$. Instead of assigning hard class probabilities of 0 or 1, we assume that true labels in the training set are correct with $1-\epsilon$ probability and incorrect otherwise. For $\epsilon=\num{0.1}$, a trade with the true label $-1$ is assumed to be \SI{90}{\percent} seller-initiated and \SI{10}{\percent} buyer-initiated. While we observe that label smoothing improves the validation loss and reduces the generalisation gap, we find that it has a negligible effect on validation accuracy and therefore abandon this approach.
+A major problem in classification with neural networks is that the network becomes overconfident in predicting training samples but performs poorly on unseen data. In \cref{fig:fttransformer-optimizations-loss-acc}, the effect is evident, as the increased confidence in the prediction on the training set does not transfer to the validation set. To regularize the network, we experiment with label smoothing \autocite[][2823]{szegedyRethinkingInceptionArchitecture2016} by training on soft labels with an uncertainty constant of $\epsilon$. Instead of assigning hard class probabilities of 0 or 1, we assume that true labels in the training set are correct with $1-\epsilon$ probability and incorrect otherwise. For $\epsilon=\num{0.1}$, a trade with the true label $-1$ is assumed to be \SI{90}{\percent} seller-initiated and \SI{10}{\percent} buyer-initiated. While we observe that label smoothing improves the validation loss and reduces the generalization gap, we find that it has a negligible effect on validation accuracy and therefore abandon this approach.
\emph{Learning Rate Schedule}
-When training Transformers, the learning rate is often adjusted throughout the training process. \textcite[][7]{vaswaniAttentionAllYou2017} use a learning rate warm-up period, whereby the learning rate is linearly increased in the early stages of training, followed by decay using an inverse square root learning rate schedule. The warm-up phase is thought to stabilise gradients as weight updates are considerably smaller. According to \cref{sec:residual-connections-layer-norm}, learning rate warm-up is crucial for training post-norm Transformers, but optional for pre-norm Transformers like the FT-Transformer. Nevertheless, we experiment with the effect of learning rate warm-up in our setting and combine a linear warm-up for two epochs with subsequent cosine decay, as visualised in \cref{fig:lr-lin-warmup-cosine-decay}.
+When training Transformers, the learning rate is often adjusted throughout the training process. \textcite[][7]{vaswaniAttentionAllYou2017} use a learning rate warm-up period, whereby the learning rate is linearly increased in the early stages of training, followed by decay using an inverse square root learning rate schedule. The warm-up phase is thought to stabilize gradients as weight updates are considerably smaller.
According to \cref{sec:residual-connections-layer-norm}, learning rate warm-up is crucial for training post-norm Transformers, but optional for pre-norm Transformers like the FT-Transformer. Nevertheless, we experiment with the effect of learning rate warm-up in our setting and combine a linear warm-up for two epochs with subsequent cosine decay, as visualized in \cref{fig:lr-lin-warmup-cosine-decay}.
\begin{figure}[!ht] \centering
@@ -96,13 +96,13 @@ \subsubsection{Training of Supervised
\label{fig:lr-lin-warmup-cosine-decay}
\end{figure}
-The scheduled learning rate has soothing effects on the training loss and accuracy estimates, as evident in \cref{fig:fttransformer-optimisations-loss-acc}. Therefore, we adopt a training setup with a learning rate schedule, despite the negative effects on training time. The learning rate itself is tuned as part of \cref{sec:hyperparameter-tuning}.
+The scheduled learning rate has a smoothing effect on the training loss and accuracy estimates, as evident in \cref{fig:fttransformer-optimizations-loss-acc}. Therefore, we adopt a training setup with a learning rate schedule, despite the negative effects on training time. The learning rate itself is tuned as part of \cref{sec:hyperparameter-tuning}.
\emph{Batch Size}
% 20 epochs (\num{36460} / \num{145840} iterations)
-We use a fixed batch size of \num{8192} samples for the feature set classic/size and \num{2048} for the feature set option, which is the largest possible size on our \gls{GPU}. Training is performed for \num{20} epochs at maximum. All samples within the training and validation set are shuffled randomly to promote convergence. Although a smaller batch size could enhance the generalisation capabilities of the model, as found in \textcite[][3]{keskarLargeBatchTrainingDeep2017}, we train on the largest number of trades per iteration, to optimise throughput. Additional regularisation is added to the model, but treated as a tunable hyperparameter.
+We use a fixed batch size of \num{8192} samples for the feature sets classical/size and \num{2048} for the feature set option, which is the largest possible size on our \gls{GPU}. Training is performed for a maximum of \num{20} epochs. All samples within the training and validation set are shuffled randomly to promote convergence. Although a smaller batch size could enhance the generalization capabilities of the model, as found in \textcite[][3]{keskarLargeBatchTrainingDeep2017}, we train on the largest number of trades per iteration to optimize throughput. Additional regularization is added to the model, but is treated as a tunable hyperparameter.
\emph{Early Stopping and Checkpointing}
@@ -110,7 +110,7 @@ \subsubsection{Training of Supervised
\emph{Optimizer}
-In line with \textcite[][6]{gorishniyRevisitingDeepLearning2021}, we train the models using the AdamW optimizer \autocite[][2--3]{loshchilovDecoupledWeightDecay2019} with the standard hyperparameters.\footnote{Parameters $\beta_{1}=0.9, \beta_{2}=0.999$, and $\epsilon = \num{1e-8}$.} The weight decay coefficient in AdamW determining the degree of regularisation is tuned in \cref{sec:hyperparameter-tuning}. Weight decay is selectively applied and excludes embeddings, LayerNorm, and biases.
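For reference, the combination of a two-epoch linear warm-up with subsequent cosine decay described above can be expressed with a standard PyTorch \texttt{LambdaLR} schedule; the sketch below assumes per-iteration updates and illustrative step counts rather than reproducing the exact training script.
\begin{verbatim}
import math
import torch

def warmup_cosine(step: int, warmup_steps: int, total_steps: int) -> float:
    """Multiplicative factor: linear warm-up, then cosine decay to zero."""
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return 0.5 * (1.0 + math.cos(math.pi * progress))

model = torch.nn.Linear(10, 2)  # stand-in for the FT-Transformer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-2)

steps_per_epoch, max_epochs = 1000, 20  # illustrative values
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer,
    lr_lambda=lambda step: warmup_cosine(
        step, 2 * steps_per_epoch, max_epochs * steps_per_epoch
    ),
)
# Inside the training loop, call optimizer.step() followed by scheduler.step()
# once per iteration.
\end{verbatim}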
+In line with \textcite[][6]{gorishniyRevisitingDeepLearning2021}, we train the models using the AdamW optimizer \autocite[][2--3]{loshchilovDecoupledWeightDecay2019} with the standard hyperparameters.\footnote{Parameters $\beta_{1}=0.9, \beta_{2}=0.999$, and $\epsilon = \num{1e-8}$.} The weight decay coefficient in AdamW, determining the degree of regularization, is tuned in \cref{sec:hyperparameter-tuning}. Weight decay is selectively applied and excludes embeddings, LayerNorm, and biases.
In summary, we extend the training setup of \textcite[][6]{gorishniyRevisitingDeepLearning2021} with a sample weighting scheme and a learning rate schedule aimed at boosting performance and training stability.
@@ -127,28 +127,28 @@ \subsubsection{Training of Semi-supervised
\textbf{Gradient Boosting With Self-Training}
-To incorporate unlabelled trades into the training procedure, we combine gradient boosting with a self-training classifier, as derived in \cref{sec:extensions-to-gradient-boosted-trees}. We repeat self-training for 2 iterations and require the predicted class probability to exceed $\tau=0.9$. As the entire ensemble is rebuilt three times, the relatively low number of iterations and high confidence threshold, strike a balance between computational requirements and the need for high-quality predictions. The base classifier is otherwise identical to supervised gradient boosting from \cref{sec:training-of-supervised-models}.
+To incorporate unlabeled trades into the training procedure, we combine gradient boosting with a self-training classifier, as derived in \cref{sec:extensions-to-gradient-boosted-trees}. We repeat self-training for 2 iterations and require the predicted class probability to exceed $\tau=0.9$. As the entire ensemble is rebuilt three times, the relatively low number of iterations and the high confidence threshold strike a balance between computational requirements and the need for high-quality predictions. The base classifier is otherwise identical to supervised gradient boosting from \cref{sec:training-of-supervised-models}.
\textbf{FT-Transformer with Pre-Training}
-The FT-Transformer is trained in two stages. First, we train for \num{20} epochs on unlabelled \gls{ISE} trades using the \gls{RTD} head, followed by \num{20} epochs of fine-tuning on labelled \gls{ISE} training data with the binary classification head.
+The FT-Transformer is trained in two stages. First, we train for \num{20} epochs on unlabeled \gls{ISE} trades using the \gls{RTD} head, followed by \num{20} epochs of fine-tuning on labeled \gls{ISE} training data with the binary classification head.
During pre-training and fine-tuning, early stopping is applied based on the value of the objective on the validation set, using a patience of \num{10}. This particular setup is adopted from \textcite[][15]{rubachevRevisitingPretrainingObjectives2022} for being compute-efficient and offering competitive performance. The hidden dimension of the classification head is set to \num{512}. Following \textcite[][3]{clarkElectraPretrainingText2020}, \SI{15}{\percent} of all tokens are replaced.
-Since the unlabelled sample includes various types of trades that may not be comparable to the labelled sample, we update all layers during fine-tuning. Empirically, finetuning the entire model is among the most successful methods for large-scale Transformers, as results from \textcite[][104--105]{raeScalingLanguageModels2022} suggest.
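The self-training wrapper described above can be sketched as follows, assuming a scikit-learn-style base classifier with \texttt{predict\_proba}; the two iterations and the threshold $\tau=0.9$ follow the text, while the function and variable names are ours.
\begin{verbatim}
import numpy as np

def self_train(base_clf, X_lab, y_lab, X_unlab, n_iter=2, tau=0.9):
    """Iteratively add confidently pseudo-labeled trades and refit the ensemble."""
    X_train, y_train = X_lab.copy(), y_lab.copy()
    for _ in range(n_iter):
        base_clf.fit(X_train, y_train)
        proba = base_clf.predict_proba(X_unlab)
        confident = proba.max(axis=1) >= tau  # keep high-confidence trades only
        if not confident.any():
            break
        pseudo = base_clf.classes_[proba[confident].argmax(axis=1)]
        X_train = np.vstack([X_train, X_unlab[confident]])
        y_train = np.concatenate([y_train, pseudo])
        X_unlab = X_unlab[~confident]  # drop pseudo-labeled trades from the pool
    return base_clf.fit(X_train, y_train)  # final refit; third build of the ensemble
\end{verbatim}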
+Since the unlabeled sample includes various types of trades that may not be comparable to the labeled sample, we update all layers during fine-tuning. Empirically, fine-tuning the entire model is among the most successful methods for large-scale Transformers, as results from \textcite[][104--105]{raeScalingLanguageModels2022} suggest.
Following \textcite[][4]{rubachevRevisitingPretrainingObjectives2022}, the learning rate and weight decay are shared between the pre-training and fine-tuning stages. Given the nature of pre-training, all other hyperparameters related to the model are identical.
\subsubsection{Hyperparameter Tuning}\label{sec:hyperparameter-tuning}
-All of our machine-learning models feature a set of tunable hyperparameters. The results of previous studies, exemplary the one of \textcite[][5]{grinsztajnWhyTreebasedModels2022}, emphasise the need for tuning routines, as the test performance of the FT-Transformer and \glspl{GBRT} largely fluctuates with the hyperparameter configuration. Classical rules have no hyperparameters per se, but the best hybrid rules can be attained through hyperparameter search.
+All of our machine-learning models feature a set of tunable hyperparameters. The results of previous studies, such as that of \textcite[][5]{grinsztajnWhyTreebasedModels2022}, emphasize the need for tuning routines, as the test performance of the FT-Transformer and \glspl{GBRT} largely fluctuates with the hyperparameter configuration. Classical rules have no hyperparameters per se, but the best hybrid rules can be attained through hyperparameter search.
For a fair comparison, we employ an extensive hyperparameter search to find a suitable configuration for each of our models.
\textbf{Bayesian Search}
We perform a Bayesian search to suggest and tune the hyperparameters automatically. In Bayesian search, a prior belief over all possible objective functions is formulated from the parameter intervals, which is then gradually refined by updating the Bayesian posterior with data from previous trials, thereby approximating the likely objective function \autocite[][2]{shahriariTakingHumanOut2016}. Compared to brute-force approaches, such as grid search, unpromising search regions are omitted, resulting in more promising trials.
-While different algorithmic implementations exist for Bayesian optimisation, we choose the \emph{Optuna} library \autocite[][1--10]{akibaOptunaNextgenerationHyperparameter2019}, which implements the tree parzen estimator algorithm and is capable of handling both continuous and categorical hyperparameters.\footnote{Implementation of the tree-parzen estimator searches the first 10 trials randomly before the completed trials affect the sampling.} We maximise the accuracy of the validation set, which is also our decisive metric for evaluation (cp. \cref{sec:evaluation-metric}), and run $\num{50}$ trials per feature set for the \gls{GBRT} and $\num{10}$ trials for the FT-Transformer. The best combination of each is tested out-of-sample in \cref{sec:results}.
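A compact version of this search could look as follows; the sketch assumes CatBoost as the \gls{GBRT} implementation (suggested by the hyperparameter names), synthetic stand-in data, and the search space of \cref{tab:hyperparameter-space-gbm}, so it illustrates the procedure rather than reproducing the exact tuning script.
\begin{verbatim}
import optuna
from catboost import CatBoostClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Stand-in for the labeled trade data and the chronological split.
X, y = make_classification(n_samples=2000, n_features=10, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

def objective(trial: optuna.Trial) -> float:
    params = {
        "depth": trial.suggest_int("depth", 1, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.125, log=True),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 30),
        "random_strength": trial.suggest_float("random_strength", 1e-9, 10.0, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
    }
    model = CatBoostClassifier(**params, verbose=False)
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100)
    return accuracy_score(y_val, model.predict(X_val))

# Tree-structured Parzen estimator; the first trials are sampled randomly.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=50)  # 50 trials per feature set for the GBRT
\end{verbatim}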
+While different algorithmic implementations exist for Bayesian optimization, we choose the \emph{Optuna} library \autocite[][1--10]{akibaOptunaNextgenerationHyperparameter2019}, which implements the tree-structured Parzen estimator algorithm and is capable of handling both continuous and categorical hyperparameters.\footnote{The implementation of the tree-structured Parzen estimator searches the first 10 trials randomly before the completed trials affect the sampling.} We maximize accuracy on the validation set, which is also our decisive metric for evaluation (cp. \cref{sec:evaluation-metric}), and run $\num{50}$ trials per feature set for the \gls{GBRT} and $\num{10}$ trials for the FT-Transformer. The best combination of each is tested out-of-sample in \cref{sec:results}.
\textbf{Gradient Boosting}
@@ -164,15 +164,15 @@ \subsubsection{Hyperparameter Tuning}\label{sec:hyperparameter-tuning}
Hyperparameter & Distribution \\ \midrule Depth & $\operatorname{UniformInt}[1,12]$ \\ Learning rate $\eta$ & $\operatorname{LogUniform}[0.001, 0.125]$ \\
- $\ell_2$ Leaf Regularisation & $\operatorname{UniformInt}[2, 30]$ \\
+ $\ell_2$ Leaf Regularization & $\operatorname{UniformInt}[2, 30]$ \\
Random Strength & $\operatorname{LogUniform}[\num{1e-9}, 10.0]$ \\ Bagging Temperature & $\operatorname{Uniform}[0.0, 1.0]$ \\ \bottomrule \end{tabular} \end{table}
-As documented in \cref{tab:hyperparameter-space-gbm}, we tune five hyperparameters for gradient boosting. The first is the depth, which determines the number of levels in each tree. Other than \textcite[][]{gorishniyRevisitingDeepLearning2021}, we increase the upper bound to twelve to allow for more complex ensemble members. Acknowledging the research of \textcite[][14]{friedmanGreedyFunctionApproximation2001} that the learning rate \eta~and the size of the ensemble have a strong interdependence, we only tune the learning rate and stop extending the ensemble based on the early stopping criterion. Random strength, bagging temperature, and $\ell_2$ leaf regularisation are measures to counter overfitting. Specifically, random strength controls the degree of Gaussian noise added to the scores of split candidates to introduce randomness in the selected splits.
+As documented in \cref{tab:hyperparameter-space-gbm}, we tune five hyperparameters for gradient boosting. The first is the depth, which determines the number of levels in each tree. Unlike \textcite[][]{gorishniyRevisitingDeepLearning2021}, we increase the upper bound to twelve to allow for more complex ensemble members. Acknowledging the finding of \textcite[][14]{friedmanGreedyFunctionApproximation2001} that the learning rate $\eta$~and the size of the ensemble have a strong interdependence, we only tune the learning rate and stop extending the ensemble based on the early stopping criterion. Random strength, bagging temperature, and $\ell_2$ leaf regularization are measures to counter overfitting. Specifically, random strength controls the degree of Gaussian noise added to the scores of split candidates to introduce randomness in the selected splits.
In a similar vein, the algorithm introduces randomness on the sample level through Bayesian bootstrap \autocite[][130--131]{rubinBayesianBootstrap1981}. The hyperparameter controls the distribution used for sampling, and implicitly the aggressiveness of bagging. Finally, $\ell_2$ leaf regularization adds a penalty term to the terminal leaf's estimates. The hyperparameter controls the degree of regularization.
-\cref{fig:ise-gbm-hyperparam-classical} visualises the hyperparameter search space of the \gls{GBRT} on the \gls{ISE} dataset with classical features, from which we can derive several observations. First, hyperparameter tuning has a significant impact on the prediction, as the validation accuracy varies between \SI{58.429}{\percent} and \SI{64.378}{\percent} for different trials. Second, the best hyperparameter combination, marked with \bestcircle, lies off-the-borders surrounded by other promising trials, indicated by the contours, from which we can conclude, that the found solution is a stable and reasonable choice for further analysis.
+\cref{fig:ise-gbm-hyperparam-classical} visualizes the hyperparameter search space of the \gls{GBRT} on the \gls{ISE} dataset with classical features, from which we can derive several observations. First, hyperparameter tuning has a significant impact on the prediction, as the validation accuracy varies between \SI{58.429}{\percent} and \SI{64.378}{\percent} for different trials. Second, the best hyperparameter combination, marked with \bestcircle, lies away from the borders of the search space and is surrounded by other promising trials, indicated by the contours, from which we conclude that the found solution is a stable and reasonable choice for further analysis.
\begin{figure}[!h] \subfloat[Hyperparameter Search Space of \gls{GBRT} With Feature Set Classical\label{fig:ise-gbm-hyperparam-classical}]{\includegraphics[width=0.6\textwidth]{1gzk7msy-hyperparam-search-space.pdf}}
Visualisations of the hyperparameter search space are available online.\footnote{See \url{https://wandb.ai/fbv/thesis/runs/37lymmzc} for \gls{FS} classical, \url{https://wandb.ai/fbv/thesis/runs/324v3uv5} for \gls{FS} size, and \url{https://wandb.ai/fbv/thesis/runs/t55nd8r0} for \gls{FS} option.}
+The search space for the semi-supervised variant is identical to the supervised gradient boosting. To conserve space, we only report the tabulated results in \cref{tab:solutions-GBRT-self-training}. Visualizations of the hyperparameter search space are available online.\footnote{See \url{https://wandb.ai/fbv/thesis/runs/37lymmzc} for \gls{FS} classical, \url{https://wandb.ai/fbv/thesis/runs/324v3uv5} for \gls{FS} size, and \url{https://wandb.ai/fbv/thesis/runs/t55nd8r0} for \gls{FS} option.}
\begin{table}[!h] \centering
@@ -222,14 +222,14 @@ \subsubsection{Hyperparameter Tuning}\label{sec:hyperparameter-tuning}
Hyperparameter & Distribution & {\glsentryshort{FS} Classical} & {\glsentryshort{FS} Size} & {\glsentryshort{FS} Option} \\ \midrule Depth & $\operatorname{UniformInt}[1,12]$ & 9 & 10 & 9 \\ Learning rate $\eta$ & $\operatorname{LogUniform}[0.001, 0.125]$ & 0.12337960608926582 & 0.1248422186404667 & 0.12347504812996231 \\
- $\ell_2$ Leaf Regularisation & $\operatorname{UniformInt}[2, 30]$ & 12 & 9 & 13 \\
+ $\ell_2$ Leaf Regularization & $\operatorname{UniformInt}[2, 30]$ & 12 & 9 & 13 \\
Random Strength & $\operatorname{LogUniform}[\num{1e-9}, 10.0]$ & \num{2e-8} & \num{5e-8} & \num{5e-8} \\ Bagging Temperature & $\operatorname{Uniform}[0.0, 1.0]$ & 0.34010535578784745 & 0.5214954412829511 & 0.4666577105566224 \\ \midrule \multicolumn{2}{l}{Validation Accuracy in \%} & {$\textcolor{viz-red}{\downarrow} \num{64.29671279599335}$} & {$\textcolor{viz-red}{\downarrow} \num{74.83010065958079}$} & {$\textcolor{viz-red}{\downarrow} \num{76.41433947686962}$} \\ \bottomrule \end{tabular} \end{table}
-Matching the supervised results, semi-supervised ensembles exhaust the maximum tree depth and combine trees with a coarse learning rate. By parameter importance, both are most influential on the final result. Again, this is an indication that the trade data is not easily separable, requiring multiple features and splits. The found hyperparameters for $\ell_2$ leaf regularisation, random strength and bagging are balanced. Overall, the best validation accuracies are slightly inferior to the supervised variant.
+Matching the supervised results, semi-supervised ensembles exhaust the maximum tree depth and combine trees with a coarse learning rate. By parameter importance, both are most influential on the final result. Again, this is an indication that the trade data is not easily separable, requiring multiple features and splits. The selected values for $\ell_2$ leaf regularization, random strength, and bagging temperature are balanced. Overall, the best validation accuracies are slightly inferior to the supervised variant.
\clearpage
@@ -272,7 +272,7 @@ \subsubsection{Hyperparameter Tuning}\label{sec:hyperparameter-tuning}
With the smaller number of trials, the search space is less densely populated. Despite the decreased coverage, the impact of hyperparameter tuning on the searched combinations is less pronounced than for the \gls{GBRT}. As such, accuracies for \cref{fig:ise-transformer-hyperparam} fluctuate between \SI{62.55}{\percent} and \SI{64.65}{\percent}. This pattern carries over to the other feature sets. Regardless, the validation accuracies are higher than for gradient boosting.
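The parameter importances referred to above can, for instance, be read off a completed Optuna study; this generic sketch (reusing the \texttt{study} object from the tuning sketch earlier) is our illustration, not the code behind the reported numbers.
\begin{verbatim}
import optuna

# `study` is a completed Optuna study, e.g., from the tuning sketch above.
importances = optuna.importance.get_param_importances(study)
for name, value in importances.items():
    print(f"{name}: {value:.3f}")  # e.g., depth and learning_rate dominating
\end{verbatim}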
-The best combination is identical for all feature sets. The embedding dimension is at \num{248}, which is almost maxed out and the model uses four Transformer blocks. Thus, it has a medium capacity with \num{1985649} to \num{4252369} parameters, depending on the feature set. From all hyperparameters, the number of layers is most impactful on the final accuracy. While the use of dropout and weight decay for regularisation is minimal and marginally affects the validation performance. The large token dimensionality and mid-layer count could be an indication, that few attention heads are enough to extract patterns, but the learned patterns are relatively complex. The search results are compiled in \cref{tab:solutions-transformer}.
+The best combination is identical for all feature sets. The embedding dimension is \num{248}, close to the upper bound, and the model uses four Transformer blocks. Thus, it has a medium capacity with \num{1985649} to \num{4252369} parameters, depending on the feature set. Of all hyperparameters, the number of layers is most impactful on the final accuracy, while the use of dropout and weight decay for regularization is minimal and only marginally affects the validation performance. The large token dimensionality and mid-layer count could be an indication that few attention heads are enough to extract patterns, but that the learned patterns are relatively complex. The search results are compiled in \cref{tab:solutions-transformer}.
\begin{table}[!h] \centering
@@ -307,7 +307,7 @@ \subsubsection{Hyperparameter Tuning}\label{sec:hyperparameter-tuning}
Pre-training performance is, however, bound by the available computing budget. As evident from \cref{fig:fttransformer-pretrain-loss}, the models have not fully converged until the end of pre-training, as the loss on the training and validation set steadily improves.
-Validation accuracy after finetuning improves for all models over Transformers without pretraining. As the search space is identically sampled for both variants we can directly attribute the improvements of \SI{0.28}{\percent} to \SI{0.72}{\percent} in validation accuracy to pre-training on unlabelled trades. Visualisations of the hyperparameter search spaces are available online.\footnote{See \url{https://wandb.ai/fbv/thesis/runs/12isqh2m} for \gls{FS} classical, for \url{https://wandb.ai/fbv/thesis/runs/2hv1nayy} for \gls{FS} size, and \url{https://wandb.ai/fbv/thesis/runs/3jbqpp4r} for \gls{FS} option for details.}
+Validation accuracy after fine-tuning improves for all models over Transformers without pre-training. As the search space is identically sampled for both variants, we can directly attribute the improvements of \SI{0.28}{\percent} to \SI{0.72}{\percent} in validation accuracy to pre-training on unlabeled trades. Visualizations of the hyperparameter search spaces are available online.\footnote{See \url{https://wandb.ai/fbv/thesis/runs/12isqh2m} for \gls{FS} classical, \url{https://wandb.ai/fbv/thesis/runs/2hv1nayy} for \gls{FS} size, and \url{https://wandb.ai/fbv/thesis/runs/3jbqpp4r} for \gls{FS} option.}
\begin{table}[!h]
-Optimising hybrids of trade classification rules through Bayesian search is experimentally feasible by the stacking paradigm of \cref{sec:rule-based-approaches} and by treating the rules as a tunable hyperparameter. We consider all rules from \cref{sec:rule-based-approaches} learned on adjacent quotes of the exchange and \gls{NBBO} level or adjacent prices at the exchange and inter-exchange level and stack up to six rules. To model simple rules, consisting of a single or few rules, we add an identity mapping, $\operatorname{Id}$, that defers classification to later rules in the stack. A caveat of this approach is that sampled combinations may not be economically meaningful e.g., applying depth rule after tick rule, or not effective e.g., quote rule after tick rule, assuming complete data. Despite being unexplored, a conditional search space or human-in-the-loop sampling could account for this. +Optimizing hybrids of trade classification rules through Bayesian search is experimentally feasible by the stacking paradigm of \cref{sec:rule-based-approaches} and by treating the rules as a tunable hyperparameter. We consider all rules from \cref{sec:rule-based-approaches} learned on adjacent quotes of the exchange and \gls{NBBO} level or adjacent prices at the exchange and inter-exchange level and stack up to six rules. To model simple rules, consisting of a single or few rules, we add an identity mapping, $\operatorname{Id}$, that defers classification to later rules in the stack. A caveat of this approach is that sampled combinations may not be economically meaningful e.g., applying depth rule after tick rule, or not effective e.g., quote rule after tick rule, assuming complete data. Despite being unexplored, a conditional search space or human-in-the-loop sampling could account for this. -After all, we find no outperformance over hybrid rules already reported in the literature, as documented online.\footnote{For \gls{FS} classical our best combination of $\operatorname{quote}_{\mathrm{nbbo}} \to \operatorname{quote}_{\mathrm{ex}} \to \operatorname{rtick}_{\mathrm{all}}$ (simplified) reaches a validation accuracy of \SI{58.93934926393819}{\percent} equalling the solution of \textcite[][12]{grauerOptionTradeClassification2022}. For \gls{FS} size/option the best search solution is $\operatorname{tsize}_{\mathrm{ex}} \to \operatorname{depth}_{\mathrm{ex}} \to \operatorname{quote}_{\mathrm{nbbo}} \to \operatorname{tick}_{\mathrm{all}}$ (simplified) with \SI{69.03521015523933}{\percent} accuracy. The combination of \textcite[][14]{grauerOptionTradeClassification2022} reaches with $\operatorname{tsize}_{\mathrm{ex}} \to \operatorname{quote}_{\mathrm{nbbo}} \to \operatorname{quote}_{\mathrm{ex}} \to \operatorname{depth}_{\mathrm{nbbo}} \to \operatorname{depth}_{\mathrm{ex}} \to \operatorname{rtick}_{\mathrm{all}}$ an accuracy of \SI{69.3726}{\percent}. See \url{https://wandb.ai/fbv/thesis/runs/3f2m9c6i} and \url{https://wandb.ai/fbv/thesis/runs/16d6e4dk} for details. Experiments are run with \num{500} trials each.} Our combinations match or trail the accuracies of rules from \textcite[][12--14]{grauerOptionTradeClassification2022} on the \gls{ISE} validation set. Subsequently, we adopt their combinations as our benchmark, considering them to be the most challenging. 
+Ultimately, we find no outperformance over hybrid rules already reported in the literature, as documented online.\footnote{For \gls{FS} classical, our best combination of $\operatorname{quote}_{\mathrm{nbbo}} \to \operatorname{quote}_{\mathrm{ex}} \to \operatorname{rtick}_{\mathrm{all}}$ (simplified) reaches a validation accuracy of \SI{58.93934926393819}{\percent}, equaling the solution of \textcite[][12]{grauerOptionTradeClassification2022}. For \gls{FS} size/option, the best search solution is $\operatorname{tsize}_{\mathrm{ex}} \to \operatorname{depth}_{\mathrm{ex}} \to \operatorname{quote}_{\mathrm{nbbo}} \to \operatorname{tick}_{\mathrm{all}}$ (simplified) with \SI{69.03521015523933}{\percent} accuracy. The combination of \textcite[][14]{grauerOptionTradeClassification2022}, $\operatorname{tsize}_{\mathrm{ex}} \to \operatorname{quote}_{\mathrm{nbbo}} \to \operatorname{quote}_{\mathrm{ex}} \to \operatorname{depth}_{\mathrm{nbbo}} \to \operatorname{depth}_{\mathrm{ex}} \to \operatorname{rtick}_{\mathrm{all}}$, reaches an accuracy of \SI{69.3726}{\percent}. See \url{https://wandb.ai/fbv/thesis/runs/3f2m9c6i} and \url{https://wandb.ai/fbv/thesis/runs/16d6e4dk} for details. Experiments are run with \num{500} trials each.} Our combinations match or trail the accuracies of rules from \textcite[][12--14]{grauerOptionTradeClassification2022} on the \gls{ISE} validation set. Subsequently, we adopt their combinations as our benchmark, considering them to be the most challenging.
From all candidate algorithms, a combination of quote and tick rules, $\operatorname{quote}_{\mathrm{nbbo}} \to \operatorname{quote}_{\mathrm{ex}} \to \operatorname{rtick}_{\mathrm{all}}$, where the quote rule is first applied to the \gls{NBBO} and then to quotes of the \gls{ISE}, followed by the reverse tick rule at the inter-exchange level, performs best, reaching a validation accuracy of \SI{58.76225138074204}{\percent}. For brevity, we refer to this combination as the \gls{GSU} method (small). It can be estimated using features from feature set one, which qualifies it as a benchmark.
@@ -343,5 +343,5 @@ \subsubsection{Hyperparameter Tuning}\label{sec:hyperparameter-tuning}
In the absence of other baselines, we repeatedly compare against the same method as a baseline for the third feature set, even though it does not involve option-specific features.
-In the direct comparison between the validation accuracies of classical rules and our classifiers, the validation accuracies of classical rules considerably underperform the learnt classifier. \cref{sec:results} discusses if the results hold for the test sets. But before we do so, we present the metrics used
+In a direct comparison of validation accuracies, classical rules considerably underperform the learned classifiers. \cref{sec:results} discusses whether the results hold for the test sets. But before we do so, we present the metrics used for evaluation.
\ No newline at end of file
diff --git a/reports/expose.tex b/reports/expose.tex
index f6f7115c..5d71847e 100644
--- a/reports/expose.tex
+++ b/reports/expose.tex
@@ -21,7 +21,7 @@
\usepackage{acronym} % Enables the incorporation of a list of abbreviations. \usepackage{nomencl} % Useful to create a list of symbols. \usepackage{enumerate} % Useful for enumerations.
- \usepackage{color} % Enables the definition of colours.
+ \usepackage{color} % Enables the definition of colors.
% Tables and Graphs \usepackage{booktabs} % Improves the design of the tables @@ -33,7 +33,7 @@ \usepackage[hypcap=false]{caption} % Provides many ways to customize captions. % Mathematics - \usepackage{amscd,amsfonts,amsmath,amssymb,amsthm,amscd,bbm} % Extends the maths set. + \usepackage{amscd,amsfonts,amsmath,amssymb,amsthm,amscd,bbm} % Extends the math set. % --------------------------------- Information on thesis --------------------------------- diff --git a/reports/thesis.tex b/reports/thesis.tex index 45336625..37a215dd 100644 --- a/reports/thesis.tex +++ b/reports/thesis.tex @@ -32,7 +32,7 @@ \definecolor{viz-red}{HTML}{FF0000} \definecolor{viz-gray}{HTML}{D6DCE5} \definecolor{viz-white}{HTML}{FFFFFF} -\usepackage{colorprofiles} % load colour profiles for pdf/a standard +\usepackage{colorprofiles} % load color profiles for pdf/a standard % https://tex.stackexchange.com/questions/188533/how-to-draw-squares-circles-and-triangles \usepackage{tikz} @@ -69,8 +69,8 @@ \usepackage[font={small}]{floatrow} \graphicspath{{./Graphs/}} % Tells LATEX that the images are kept in a folder named images under the directory of the main document. -\usepackage[hypcap=false,font={small}]{caption} % Provides many ways to customise captions. -%\usepackage[hypcap=false,font={sf, small}]{caption} % Provides many ways to customise captions. +\usepackage[hypcap=false,font={small}]{caption} % Provides many ways to customize captions. +%\usepackage[hypcap=false,font={sf, small}]{caption} % Provides many ways to customize captions. \usepackage{siunitx} % Enables the use of SI units e. g., proper handling of percentage @@ -102,7 +102,7 @@ \usepackage[super]{nth} % 1st, 2nd etc. \usepackage{import} % path for inkscape graphics % Mathematics - \usepackage{amscd,amsfonts,amsmath,amssymb,amsthm,amscd,bbm} % Extends the maths set. + \usepackage{amscd,amsfonts,amsmath,amssymb,amsthm,amscd,bbm} % Extends the math set. % PDF/a standard \usepackage[a-2b,mathxmp]{pdfx} @@ -205,7 +205,7 @@ \newacronym{MCC}{MCC}{Matthews correlation coefficient} \newacronym{ML}{ML}{machine learning} \newacronym{MLP}{MLP}{multi-layer perceptron} -\newacronym{MLM}{MLM}{masked language modelling} +\newacronym{MLM}{MLM}{masked language modeling} \newacronym[first={national best bid and offer}]{NBBO}{NBBO}{national best bid and offer} \newacronym{NYSE}{NYSE}{New York Stock Exchange} \newacronym{NASDAQ}{NASDAQ}{National Association of Securities Dealers Automated Quotations}