From b9db7795d46a756439da109d7777b00dbc7dd47c Mon Sep 17 00:00:00 2001
From: Markus Bilz <mail@markusbilz.com>
Date: Tue, 27 Jun 2023 15:48:05 +0200
Subject: [PATCH] =?UTF-8?q?Rework=20/=20complete=20chapter=20on=20feature?=
 =?UTF-8?q?=20set=20definition=F0=9F=A7=99=20(#418)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../\360\237\224\232Discussion notes.md"      |  19 +++
 .../\360\237\215\225Application study.md"     |   9 +-
 ...200\215\360\237\215\263Tain-Test-split.md" |  13 ++
 .../\360\237\247\203Feature Sets.md"          |  65 +++++++--
 .../@anandStealthTradingOptions2007.md"       |  13 ++
 .../@chenDemandCrashInsurance2019.md"         |  13 ++
 .../@muravyevTherePriceDiscovery2013.md"      |  13 ++
 reports/Content/Appendix.tex                  |  15 ++-
 reports/Content/bibliography.bib              |  81 +++++++++++-
 reports/Content/data-preprocessing.tex        | 123 +++++++++---------
 reports/Content/end.tex                       |   2 +
 reports/Content/introduction.tex              |   2 +-
 reports/Content/results.tex                   |  14 +-
 reports/Content/rule-approaches.tex           |   4 +
 reports/Content/supervised-approaches.tex     |   1 +
 reports/Content/training-tuning.tex           |   4 +-
 reports/thesis.tex                            |   6 +-
 17 files changed, 308 insertions(+), 89 deletions(-)
 create mode 100644 "references/obsidian/\360\237\223\245Inbox/@anandStealthTradingOptions2007.md"
 create mode 100644 "references/obsidian/\360\237\223\245Inbox/@chenDemandCrashInsurance2019.md"
 create mode 100644 "references/obsidian/\360\237\223\245Inbox/@muravyevTherePriceDiscovery2013.md"

diff --git "a/references/obsidian/\360\237\223\221notes/\360\237\224\232Discussion notes.md" "b/references/obsidian/\360\237\223\221notes/\360\237\224\232Discussion notes.md"
index 8183d4b5..96341b1c 100644
--- "a/references/obsidian/\360\237\223\221notes/\360\237\224\232Discussion notes.md"	
+++ "b/references/obsidian/\360\237\223\221notes/\360\237\224\232Discussion notes.md"	
@@ -3,10 +3,20 @@
 - low accuracy for index options
 	- Study sources of missclassification. See e. g., [[@savickasInferringDirectionOption2003]]
 	- The extent to which inaccurate trade classification biases empirical research dependes on whether misclassifications occur randomly or systematically [[@theissenTestAccuracyLee2000]]. This document also contains ideas how to study the impact of wrong classifications in stock markets. Might different in option markets.
+	- “Spreads are portfolios of options of the same type (either only calls or only puts). Combinations are portfolios of options of different types. Traders can form these complex trades by individually buying the component options or by trading standard packages. The advantage of the latter approach is that the trader is subject to only one bid-ask spread, while buying the component options individually results in paying the bid-ask spread for each option. The market maker determines how to allocate the bid-ask spread among all options in a complex trade. Thus, not all (if any) of the component options necessarily trade at their quotes. Therefore, complex trades are highly likely to produce RQ and outside-quote trades. Furthermore, labeling complex trades as buys or sells is not straightforward. For example, a bull spread involves buying a call option and selling another call option with a higher strike price. Thus, a buy requires a sell, and it is not clear whether treating the two trades separately is appropriate. Index option trading involves many complex trades because taking covered positions in index options is not as easy (or possible) as in equity options. Frequently, the only alternatives to naked positions in index options are complex options. Therefore, one way to reduce the problem of complex trades is to exclude all index trades. As Table 1 indicates, this results in a significant increase in the classification precision of all methods, but loses roughly one quarter of the sample, which is unacceptable.” (Savickas and Wilson, 2003, pp. 898–899)
+	- Neither of the models can detect complex trades. Detecting them would require attention across rows and columns, which we ruled out.
+	- “In contrast to Pan and Poteshman (2006), we use a unique data set from the International Securities Exchange (ISE), which contains the complete daily record of buy and sell activity in index options over a 12-year period, together with details on whether a transaction is involved in opening or closing an options position. These options are actively traded; indeed, on the ISE, the notional volume in index options is about one-fifth of the total notional volume in all individual stock options during our sample period.” (Chordia et al., 2021, p. 1)
+
+“[…] sells is not straightforward. For example, a bull spread involves buying a call option and selling another call option with a higher strike price. Thus, a buy requires a sell, and it is not clear whether treating the two trades separately is appropriate. Index option trading involves many complex trades because taking covered positions in index options is not as easy (or possible) as in equity options. Frequently, the only alternatives to naked positions in index options are complex options. Therefore, one way to reduce the problem of complex trades is to exclude all index trades. As Table 1 indicates, this results in a significant increase in the classification precision of all methods, but loses roughly one quarter of the sample, which is unacceptable.” (Savickas and Wilson, 2003, p. 899)
 - low accuracy for trades outside the quotes
 	- see also [[@ellisAccuracyTradeClassification2000]] for trades inside and outside the spread
+	- “On the one hand, we would expect that the greater (smaller) the transaction price relative to the midspread, the more likely that the transaction is a buy (sell) and occurs on an uptick (a downtick), implying higher classification success for outside-quote trades, especially for large trades in which the trade initiator is willing to pay a premium for the execution of his large order.” ([[@savickasInferringDirectionOption2003]] p. 888)
+	- “On the other hand, however, the outside-quote trades may be the manifestation of stale quotes, which result in misclassification. Also, the effect of market makers’ hedging and rebalancing trades on the classification of outside-quote trades is unclear. Section IV.C contains a logit analysis of outside-quote trades.” ([[@savickasInferringDirectionOption2003]], p. 888)
 - high gains for options for otm options and options with long maturity
 	- Accuracy is not the sole criterion. Depends on whether error is systematic or not. Thus, we do application study. See reasoning in ([[@theissenTestAccuracyLee2000]])
+	- “Specifically, one of the most noticeable regularities is that smaller trades are classified more precisely. This is because these trades are more likely to be executed at quotes and are less prone to reversed-quote trading (partially due to the fact that many small trades are executed on RAES)” (Savickas and Wilson, 2003, p. 889)
+	- Moneyness levels: “Out-of-the-money options offer the highest leverage (exposure for a dollar invested) and thus are particularly attractive for informed investors. Consistent with this argument, the information price impact is decreasing and convex in absolute delta. Figure 3(D) shows that the impact decreases from 0.4% for out-of-the-money options to 0.15% for in-the-money options. Next, private information is often short-lived and is related to near-term events, and thus short-term options are better suited for informed investors in addition to providing higher leverage. Indeed, the price impact decreases by 0.12% if time-to-expiration decreases from 80 days to 20 days. Buyer-initiated trades have a higher price impact than sell trades, because these trades provide an opportunity to bet not only on future volatility but also on the underlying direction. These results are broadly consistent with Pan and Poteshman (2006), except that I do not find a significant difference between call and put options, perhaps because my sample consists of large stocks that are easy to sell short.” (Muravyev, 2016, p. 695)
+“Since time to maturity is inversely related to trade size, we observe greater classification errors for shorter maturity options.” (Savickas and Wilson, 2003, p. 889)
 - performance gap in classical rules
 - strong performance of neural networks / tree-based ensembles
 	- We identify missingess in data to be down-ward biasing the results of classical estimators. ML predictors are robust to this missingness, as they can handle missing values and potentially substitute.
@@ -17,6 +27,15 @@
 	- Finetune. Low cost of inference
 - which algorithm is no preferable? Do Friedman rank test
 
+## time-to-maturity
+- “Expiration dummies are particularly good instruments. Investors substitute expiring option positions with similar nonexpiring ones in the three-day window around the expiration day (every third Friday of a month). Because investors are short call and put equity options on average, the rollover creates unprecedentedly large selling pressure in the nonexpiring options. Option expirations create exogenous variation in order imbalance, and thus exogenous variation in market-maker inventories as investors open new positions to replace positions in expiring options. Volatility and returns of the underlying stocks change little around expiration. Thus, fundamentals and informed trading are not responsible for the order imbalance.” (Muravyev, 2016, p. 700)
+- “Order imbalance is extremely negative around option expiration because investors are rolling over their positions to nonexpiring options. The selling pressure is particularly large on the postexpiration Monday when the abnormal order imbalance reaches −24%.” (Muravyev, 2016, p. 701)
+
+## Quotes change after the trade
+“With respect to the intraday analysis, the interaction between trades and quotes is key to understanding how and why prices change. The literature identifies two reasons why quoted prices increase after a buyer-initiated trade. First, market-makers adjust upward their beliefs about fair value as the trade may contain private information (e.g., Glosten and Milgrom (1985)). Second, market-makers require compensation for allowing their inventory position to deviate from the desired level, and thus a risk-averse market-maker will accommodate a subsequent buy order only at a higher price (e.g., Stoll (1978)).” (Muravyev, 2016, p. 674)
+
+## Quotes NBBO / Exchange
+- “Condition (d) also serves another purpose. Since the trade price is equal to the NBBO price quoted by at least two exchanges, this condition resolves ambiguity about trade direction as further discussed in the Internet Appendix.” (Muravyev, 2016, p. 689)
 
 ## Algorithm
 2.3.7 How to Write the Discussion  Assessment of the results  Comparison of your own results with the results of other studies = Citation of already published literature!  Components  Principles, relationships, generalizations shown by the results = Discussion, not recapitulation of the results  Exceptions, lack of correlation, open points  Referring to published work: = Results and interpretations in agreement with or in contrast to your results  Our Recommendations: The writing of the chapter “Discussion” is the most difficult one. Compare your own data/results with the results from other already published papers (and cite them!). Outline the discussion part in a similar way to that in the Results section = consistency. Evaluate whether your results are in agreement with or in contrast to existing knowledge to date. You can describe why or where the differences occur, e.g. in methods, in sites, in special conditions, etc. Sometimes it is difficult to discuss results without repetition from the chapter “Results”. Then, there is the possibility to combine the “Results” and “Discussion” sections into one chapter. However, in your presentation you have to classify clearly which are your own results and which are taken from other studies. For beginners, it is often easier to separate these sections.
diff --git "a/references/obsidian/\360\237\223\226chapters/\360\237\215\225Application study.md" "b/references/obsidian/\360\237\223\226chapters/\360\237\215\225Application study.md"
index e760c4ff..93c7c2e1 100644
--- "a/references/obsidian/\360\237\223\226chapters/\360\237\215\225Application study.md"	
+++ "b/references/obsidian/\360\237\223\226chapters/\360\237\215\225Application study.md"	
@@ -37,8 +37,13 @@ The null hypothesis is that the location of medians in two independent samples a
 (🔥What can we see? How do the results compare?)
 
 
+- “During our sample period of 2004–2015, quoted half-spreads of options on stocks in the S&P 500 index averaged 13 cents per share and 8.6% of the option price. Dollar (percentage) spreads were considerably wider for well in-the-money (out-of-the-money) options.” (Muravyev and Pearson, 2020, p. 4973)
+- “Although the costs of options market making can help explain why options spreads should be higher than the spreads of their underlying stocks (Battalio and Schultz 2011), a second puzzle is that existing theories are unable to explain the observed patterns of spreads. For example, the high dollar spreads of in-the-money (ITM) options and the relation between spreads and moneyness cannot be explained by hedge rebalancing costs incurred by options market makers, because hedges of well ITM options rarely need to be rebalanced. Similarly, the pattern cannot be explained by difficult to hedge gamma and vega risks that options market makers bear when they hold inventories of options, because well ITM options are not exposed to these risks.” (Muravyev and Pearson, 2020, p. 4974)
 
-
+## Inside / Outside / At the Quote
+- “Options traders exploit this predictability in timing their executions. Executions at the ask price tend to occur when the estimate of the fair value (the expected future midpoint) is close to but less than the quoted ask price, and executions at the bid price tend to occur when it is close to but greater than the quoted bid price. Traders who exploit this predictability are able to take liquidity at low costs, as we explain next.” (Muravyev and Pearson, 2020, p. 4975)
+- “Why do option market makers not update quotes frequently? Even if liquidity providers are faster than most liquidity takers, if they are slower than only one they are at risk to get picked off. To protect against this risk, market-makers post wider spreads that do not have to be changed with every change in the option fair value. Foucault, Roell, and Sandas (2003) model the trade-off that dealers face between the cost of frequent quote revisions and the benefits of being picked off less frequently. It is also costly for option market makers to update quotes frequently because the options exchanges place caps on the number of quote updates and fine exchange members whose ratios of messages to executions is large. In addition, market frictions, such as minimum tick sizes, prevent market makers from continuously centering their quotes on the fair value. Finally, trades by execution timers incur a half-spread of about three cents, which exceeds market-makers’ marginal costs of executing trades. Thus, nontimers’ trades are highly profitable for market makers, while the spreads on timers’ trades appear to at least cover market makers’ marginal costs of trading. Thus, market makers can facilitate trading by cost sensitive investors by changing their quotes infrequently.” (Muravyev and Pearson, 2020, p. 4977)
+- “During our sample the overwhelming bulk of option trading was electronic, with market makers generally using auto-quoting algorithms and quotes and trades disseminated almost instantly to participants in both the option and equity markets. In contrast to the previous option market structure in which trading occurred on exchange floors, in the current market structure an option market maker on the exchange where trade occurs does not have any informational advantage relative to other market participants, including market makers on the equity exchanges. This helps explain our findings that option quotes do not contain information not already reflected in stock quotes.” (Muravyev et al., 2013, p. 261)
 
 
 “We repeated this analysis with our dataset from the Frankfurt Stock Exchange. The results are presented in columns 2 and 3 of Table 5. The bias is even more dramatic. The traditional spread estimate is, on average, about twice as large as the “true” spread.8 A Wilcoxon test rejects the null hypothesis of equal medians (p < 0.01). Despite the large differences, the correlation between the two spread estimates is very high (ρ= 0.96). The magnitude of the relative bias (i.e., the traditional spread estimate divided by the “true” spread) is strongly negatively related to the classification accuracy. The correlation is –0.84.” ([[@theissenTestAccuracyLee2000]], p. 12)
@@ -80,6 +85,8 @@ Savickas and Wilson 897 TABLE 5 Estimated Effective Spreads Average Sprd. Quote
 % TODO: read: Pinder, S. (2003). An empirical examination of the impact of market microstructure changes on the determinants of option bid–ask spreads. International Review of Financial Analysis, 12(5):563–577.
 
 
+“In addition, my results offer little help in answering why option bid-ask spreads are so large. This is one of the biggest puzzles in the options literature—existing theories of the option spread fail to explain its magnitude and shape (Muravyev and Pearson (2014)).” (Muravyev, 2016, p. 696)
+
 - [[@rosenthalModelingTradeDirection2012]] lists fields where trade classification is used and what the impact of wrongly classified trades is.
 - The extent to which inaccurate trade classification biases empirical research depends on whether misclassifications occur randomly or systematically [[@theissenTestAccuracyLee2000]].
 
diff --git "a/references/obsidian/\360\237\223\226chapters/\360\237\221\250\342\200\215\360\237\215\263Tain-Test-split.md" "b/references/obsidian/\360\237\223\226chapters/\360\237\221\250\342\200\215\360\237\215\263Tain-Test-split.md"
index daed9134..e43a7ff2 100644
--- "a/references/obsidian/\360\237\223\226chapters/\360\237\221\250\342\200\215\360\237\215\263Tain-Test-split.md"
+++ "b/references/obsidian/\360\237\223\226chapters/\360\237\221\250\342\200\215\360\237\215\263Tain-Test-split.md"
@@ -1,3 +1,16 @@
+Prime examples of auto-correlation between trades are market or limit orders that are split into smaller orders to encourage order execution. Also, informed traders tend to slice orders into smaller-sized trades to disguise their trading activity, as documented in ([[@anandStealthTradingOptions2007]]183). Order splitting leads to trades being executed (almost) simultaneously with similar trade characteristics, which would be trivial to classify with the true label of a single transaction.
+
+
+
+“A floor broker seeking to execute a market order for a "large" number of shares will frequently split his order among the quotations of several competing market participants, such as other floor brokers and book orders. In this situation, successive sales are a consequence of the same trade, and take place on the same side of the market, but are recorded as separate transactions. This, in turn, implies positive serial correlation in transaction type” (Choi et al., 1988, p. 221)
+
+“Limit orders also can cause serial dependence in transaction type. Suppose the current bid and ask quotes from the dealer are Pb and Pa, respectively. Limit orders to sell (buy) whose prices are lower (higher) than or equal to Pb (Pa) are transacted at the market. All other limit orders remain in the dealer's book until there is a change in his quotations. However, a change in the dealer's quotation will result in transactions only on one side of the orders in the book. For example, if the "equilibrium" price increases (decreases), many of the limit orders to sell (buy) would be transacted at the same time. These transactions are recorded separately and would, therefore, induce serial correlation in transaction type.” (Choi et al., 1988, p. 221)
+
+“If these informed traders attempt to hide their information by splitting their trades into medium size trades, we should see medium size trades associated with higher price discovery in the dominant exchange and not in the other (non-dominant) exchanges. Underpinning our analysis is the intuition that an informed trader is likely to choose the options market venue (and option trade size) that best protects her ability to hide.” (Anand and Chakravarty, 2007, p. 183)
+
+"Orders might also be split by option series" (anandStealthTradingOptions2007)
+
+
 Prior classical works assess the performance of classical rules in-sample (cp. [[@ellisAccuracyTradeClassification2000]]541) or in an out-of-sample setting (cp. [[@grauerOptionTradeClassification2022]]7--9) and ([[@chakrabartyTradeClassificationAlgorithms2007]]3814--3815).  In the presence of tunable hyperparameters in machine learning algorithms, we separate the dataset into *three* disjoint sets. The training set is used to fit the classifier to the data. The validation set is dedicated to tuning the hyperparameters, and the test set is used for unbiased out-of-sample estimates. 
 
 Trades in the dataset are ordered by time of execution, and nearby trades exhibit strong auto-correlation. Exemplary, subsequent trades on the same option series may share a similar trade price and quotes. This imposes constraints on the train-test split, which must ensure that minimal information leaks into the test set through serially-correlated features, leading to an otherwise overestimated model performance. The violation of statistical independence, out rules methods like the $k$-fold cross-validation or random test splits, both of which assume samples to be i.i.d ([[@lopezdepradoAdvancesFinancialMachine2018]] 104--105). We expect the previous research of ([[@ronenMachineLearningTrade2022]]14) to suffer from this problem leading to biased results. Differently, our work statically splits into subsets by time, which maintains the temporal ordering and eschews data leakage. This however limits the model's ability to leverage recent information for prediction beyond the training set's cut-off point. We do not explore dynamic training schemes, as they are practically intractable considering the number of model combinations and computational requirements of Transformers and gradient-boosted trees. In absence of an update mechanism, our results can be interpreted as a lower bound.
diff --git "a/references/obsidian/\360\237\223\226chapters/\360\237\247\203Feature Sets.md" "b/references/obsidian/\360\237\223\226chapters/\360\237\247\203Feature Sets.md"
index d644027c..a0128ede 100644
--- "a/references/obsidian/\360\237\223\226chapters/\360\237\247\203Feature Sets.md"	
+++ "b/references/obsidian/\360\237\223\226chapters/\360\237\247\203Feature Sets.md"	
@@ -1,21 +1,66 @@
+Our second feature set extends the first feature set by the trade size and size of the quotes, required to estimate hybrid rules involving the depth rule and trade size rule. Both rules achieve state-of-the-art performance on option trades when paired with hybrid algorithms and are thus an important source of features. We model the depth rule as the ratio between ask and bid sizes and the trade size rule as the ratio between the size of the trade and the quoted bid and ask sizes. Since features are not discretised, we obtain a generic formulation of the trade size rule, where part of the quoted size can remain unfilled. This potentially helps to distinguish limit from market orders. The trade price and midspread required for the depth rule are already encompassed in the first feature set. More generally, trade size is known to strongly affect classification. ([[@savickasInferringDirectionOption2003]]889) and ([[@ellisAccuracyTradeClassification2000]]537) report that better classification is achieved for smaller trades, as smaller trades are more likely to be executed at the quotes. By providing the model with the trade and quoted sizes we hope to make these nuances learnable.
 
-![[feature-and-feature-set-definition.png]]
+Our largest feature set also incorporates option characteristics, including the strike price, the time to maturity, the moneyness, the option type and issue type as well as the underlying and traded volume of the option series. By providing unique identifiers for the option series, we can potentially establish connections between transactions when trade initiators divide a single order into sub-orders or rely on complex trades. Similar reasoning applies to the daily volume of the option series. Features are also informative individually. Time to maturity $\tau_{i,t}$, estimated in days, indirectly affects classification performance. On gls-CBOE data in ([[@savickasInferringDirectionOption2003]]889), trades with longer maturities are smaller, hence more likely to be classified correctly. Moreover, time to maturity can be used as a dummy to identify rollovers ([[@muravyevOrderFlowExpected2016]]700). When investors are short in call or put options, they replace expiring with non-expiring options, which creates selling pressure in the non-expiring option. The feature might make this practice learnable. Related to the time to maturity is moneyness, estimated as the ratio between the price of the underlying $S_{i,t}$ and the strike price $K_{i,t}$ for calls and the reciprocal for puts. As moneyness is linked to leverage in the investment, we reason that incentives to initiate a trade might vary between buyers and sellers. Classification of certain security types, in particular index options, poses a major challenge for traditional approaches, as unanimously reported in ([[@grauerOptionTradeClassification2022]]22) and ([[@savickasInferringDirectionOption2003]]898--899). We therefore equip the models with the issue type, as well as the option type and root to extend the learnable context. 
 
-![[calculate-stats.png]]
+By providing the model with option-specific features, we make nuances between the underlying, security types, and option types learnable. 
 
+- option series (an option series is characterized by option type (call/put), underlying stock, strike price, and maturity date),
 
-![[proximity-to-quote.png]]
+## NBBO / EX
+- “The structure of the U.S. options market is similar to that of the equity market but has some distinct features. Options are typically cross-listed across multiple fully electronic exchanges, and the NBBO rule is enforced. Investors can post limit or market orders, and market-makers are obliged to provide continuous two-sided quotes.” (Chordia et al., 2021, p. 3)
+- “Although a number of markets satisfy these assumptions, the equity options market fits them particularly well. First, in early 2003, prior to the start of my sample period, all options exchanges were connected through Linkage and the National Best Bid and Offer (NBBO) rule was introduced. At the same time, investors gained access to real-time information about the best prices from all exchanges.” (Muravyev, 2016, p. 685)
+- “Second, market-makers stand on the liquidity-providing side of most trades. In the options market, market-makers transfer liquidity not only over time but also across different options. With several hundred option contracts available for each underlying, two investors rarely select the same option and thus they are likely to trade with a market-maker. Also, exchange rules grant lead market-makers substantial competitive edge over other liquidity providers (e.g., the 60/40 NBBO order split rule). These rules further strengthen lead market-makers’ position as the main liquidity providers and make it hard for new players to enter” (Muravyev, 2016, p. 685)
+- “Condition (d) also serves another purpose. Since the trade price is equal to the NBBO price quoted by at least two exchanges, this condition resolves ambiguity about trade direction as further discussed in the Internet Appendix.” (Muravyev, 2016, p. 689) There seems to be some ambiguity I do not understand yet.
 
-**Ideas:**
--   While the determination of average trade sizes allows further gauging of common and larger trade sizes, the initial order size must also be considered. In electronic markets, and particularly in options, price guidance is given by quotes. Trade initiators rarely trade at these quoted prices (5-20% of options screen volume), and mostly enter limit orders mid-market. The difference between order and trade sizes is twofold. Firstly, a trade initiator may slice an order into suborders to minimise market impact by trading 500 futures or options via 5 orders at 100 contracts. Trade records provide evidence for this practise as the trade executions feature the same counterparty on the trade at identical prices with subsequent, virtually identical timestamps. Secondly, other market participants respond to incoming orders from trade initiators in a competitive fashion with immediate-or-cancel (IOC) orders. The initial order of 500 contracts placed a tick under the best quoted offer may be traded by, for example, 3 market makers sending IOCs in 100, 100 and 300 contracts. (https://www.esma.europa.eu/sites/default/files/esma_mifid_add_fese_replyform_1.docx)
+## trade size
+- “Trade size affects classification precision in two related ways. First, as found by EMO (2000), small trades are more likely to be executed at the quotes, implying an inverse relation between classification precision and trade size. Second, large reversed-quote limit orders are more likely to be executed by the market maker in his hedging and portfolio rebalancing activity, resulting in marketmaker-initiated trades” (Savickas and Wilson, 2003, p. 888)
+- “Time to maturity also has an indirect effect on classification precision. Trades in options with longer maturity tend to be smaller, resulting in negative correlation between the effects of maturity and of trade size. Because an option’s delta is non-monotonic in maturity, we do not expect a strong correlation between the effects of maturity and underlying price change. Finally, RAES buys (sells) are automatically crossed with the market maker’s ask (bid) prices and we would expect precise quote-based classifications of these trades” (Savickas and Wilson, 2003, p. 889)
+- “Thus, large trades are more informative than medium and small trades, suggesting that option investors no longer engage in “stealth trading,” whereby informed investors split their option trades and medium-sized trades are the most informed (Anand and Chakravarty (2007)). Fourth, the underlying stock price responds to option trades instantly, and the price impact of option trades on the underlying stock price is permanent and increasing in trade size. Option trades are thus informative about the underlying stock price level.” ([[@muravyevOrderFlowExpected2016]], p. 676)
+- “There is little need for stealth trading in the options market today, however, since order flow is sparse and is spread across multiple contracts, making it hard for informed traders to hide by splitting orders into mid-sized pieces. A possible explanation is that Anand and Chakravarty (2007) use data from 1999, when options were traded manually on the floor with no linkage between exchanges.” ([[@muravyevTherePriceDiscovery2013]], p. 693)
+- informed trading: “From the above analysis, we find evidence of trade size fragmentation, but we are unable to say anything about when either medium or small size trades would be preferred by informed traders. This brings us to the second objective of this study, wherein we investigate the relation (if any) between price discovery attributable to distinct trade size classes and the role of option volume, money10We thank the referee for suggesting this comparison” ([[@anandStealthTradingOptions2007]], p. 177)
+- “If these informed traders attempt to hide their information by splitting their trades into medium size trades, we should see medium size trades associated with higher price discovery in the dominant exchange and not in the other (non-dominant) exchanges. Underpinning our analysis is the intuition that an informed trader is likely to choose the options market venue (and option trade size) that best protects her ability to hide.” (Anand and Chakravarty, 2007, p. 183)
+- “The information share of medium size trades on the dominant exchange is consistently statistically higher than the information share of medium size trades on other exchanges collectively. In sum, the evidence presented in the current section shows that informed traders do in fact look for volume in the options of the firm of interest and then fragment their trades in that exchange.” (Anand and Chakravarty, 2007, p. 184)
+- “For liquid contracts (i.e., higher option volume), the largest contribution to price discovery comes from medium size trades while for relatively illiquid contracts the largest fraction of price discovery is associated with small size trades.” (Anand and Chakravarty, 2007, p. 186) -> is there difference between volume of option series?
+- “We also show that the strategic fragmentation of trades by informed traders, and the price discovery that follows such actions, is a function of the volume of the options contract: the moneyness and the time-to-maturity of the options do not appear to play a major role after controlling for liquidity. The implication is that informed traders act strategically in fragmenting their orders depending on the liquidity in the contract that affects their ability to hide.” (Anand and Chakravarty, 2007, p. 186)
+
+## Option series
+- “Finally, we analyze informed traders’ choice of option series for each of the 20 stocks in our data with the highest option volume traded on them. Our analysis is motivated by the desire to understand which of the several option series traded on a stock are preferred by informed traders. We find that the largest information share is exhibited by at-the-money calls. Puts do not appear to contribute significantly to the price discovery process except in those days when the underlying stocks go down significantly in price, but even then their relative information share is superseded by at-the-money calls. Our findings imply that informed traders prefer at-the-money calls to execute their option trades.” ([[@anandStealthTradingOptions2007]], p. 186)
+
+## strike price
+“The underlying price change is a critical factor that can either reduce classification precision by making past prices and quotes obsolete, or enhance it by decreasing the probability of RQ trades (i.e., buys at the bid and sells at the ask). Specifically, the underlying price increase since the most recent call option quote will result in an increase in the option’s value relative to the quote, leading to the increase in market orders to buy that call at the ask. Therefore, limit orders to buy at the bid will remain unexecuted, thus reducing the probability of RQ trades. The greater the drift in the underlying price since the last quotes, the stronger the effect. Similar reasoning applies to underlying price decreases and to puts.” (Savickas and Wilson, 2003, p. 889) Footnote 7: “Using the underlying price at the time of the last trade may be more appropriate for the tick rule. Similarly, for the EMO method, one might use the underlying price at the time of the latest quote for at-the-quote trades, and the underlying price at the time of the latest trade for all other trades. However, such differential treatment would hinder cross-rule comparisons. Therefore, we use the same variable for all four methods.”
+
+## issue type
+- “In a closely related and important paper, Chen et al. (2018) show that high buying activity in deep out-of-the-money S&P 500 index put options predicts low monthly market excess returns.” (Chordia et al., 2021, p. 2)
+- “The market of DOTM SPX put options are well-suited for this purpose. First, this market is large in terms of the economic exposures it provides for aggregate tail risks.3 Second, compared to other over-the-counter derivatives that also provide exposures to aggregate tail risks, the exchange-traded SPX options have the advantages of better liquidity and almost no counterparty risk (other than exchange failure).” ([[@chenDemandCrashInsurance2019]], p. 5)
+- “Using data on the trading activities between public investors and financial intermediaries in the market of deep out-of-the-money put options on the S&P 500 index (abbreviated as DOTM SPX puts), we exploit the price-quantity relations to identify periods when shocks to intermediary constraints are likely to be the main driver of the variations in the net amount of trading between public investors and financial intermediaries.” (Chen et al., 2019, p. 1)
+- “To construct the constraint measure, we start by computing the net amount of DOTM SPX puts that public investors in aggregate purchase each month (henceforth referred to as PNBO), which also reflects the net amount of the same options that broker-dealers and market-makers sell in that month. While it is well known that financial intermediaries are net sellers of these types of options during normal times, we find that PNBO varies significantly over time and tends to fall/turn negative during times of market distress.” (Chen et al., 2019, p. 1)
+- “In contrast to Pan and Poteshman (2006), we use a unique data set from the International Securities Exchange (ISE), which contains the complete daily record of buy and sell activity in index options over a 12-year period, together with details on whether a transaction is involved in opening or closing an options position. These options are actively traded; indeed, on the ISE, the notional volume in index options is about one-fifth of the total notional volume in all individual stock options during our sample period.” (Chordia et al., 2021, p. 1) -> index options are highly liquid / frequently traded.
+- “Further, an upward movement in bid and ask quotes, which implies that informed market makers encourage sells because they have positive information, is not accompanied by higher put returns.” (Chordia et al., 2021, p. 2)
+- “Market makers offer this insurance by selling index puts. In times of market stress, the demand for insurance increases. The expected market risk premium increases as well, leading to a positive relation between public put buying and subsequent market excess returns. On the other hand, in good times, people sell puts in the hope of earning the put premium. This hypothesis is consistent with Grossman and Zhou (1996) and Bates (2008). In their models, less crash-averse agents (the intermediary firms making markets) insure the more crash-averse agents (outside investors) through options.” (Chordia et al., 2021, p. 17)
+- “Institutional investors prefer SPX options because of the ability to execute large multileg trades and relatively low transaction costs. Other index options, including the most liquid ISE index options that we study, are traded by few investors with high preference for a given index and mostly by unsophisticated retail investors. First, using intraday options transaction data from OPRA from 2006 to 2015, we find that the average dollar trade size for index options on the ISE is one-sixth of that for SPX options. Second, we note that the open-close data label trading volume as “firm” when a member like Morgan Stanley trades for their own account, while other non-market-maker trades that include retail trades are marked as “customer.” Thus, firm volume comes entirely from sophisticated institutions, while retail investor trades are part of customer volume. For position-opening trades, we compute the fraction of total volume that corresponds to customer volume. Consistent with the hypothesis that most order flow in ISE index options is retail, customer trades constitute about 90% of position-opening volume for ISE index options, which is substantially larger than the corresponding fraction for CBOE (69%).” (Chordia et al., 2021, p. 17)
+- “Our results are most consistent with the notion that investors buy protection using put options when uncertainty is high. The uncertainty is accompanied by higher required market returns, thus yielding our predictability result. This is supported by the finding that the predictability is strongest around periods with scheduled macroeconomic news announcements (when uncertainty is high). Further, put option bid–ask spreads are lower, whereas the call option spreads are the same, in weeks with major macroeconomic announcements as compared with weeks without the major macroeconomic announcements, suggesting that increased put demand during periods of high uncertainty adds to liquidity in put options markets.” (Chordia et al., 2021, p. 18)
+
+## time-to-maturity
+
+“Expiration dummies are particularly good instruments. Investors substitute expiring option positions with similar nonexpiring ones in the three-day window around the expiration day (every third Friday of a month). Because investors are short call and put equity options on average, the rollover creates unprecedentedly large selling pressure in the nonexpiring options. Option expirations create exogenous variation in order imbalance, and thus exogenous variation in market-maker inventories as investors open new positions to replace positions in expiring options. Volatility and returns of the underlying stocks change little around expiration. Thus, fundamentals and informed trading are not responsible for the order imbalance.” (Muravyev, 2016, p. 700)
 
-- https://www.jstor.org/stable/2962317?seq=12 and [[@berkmanLargeOptionTrades1996]]
+## moneyness
+- Comparing options with different maturities and moneyness, we find that our trade size rule achieves the highest improvements when applied to options with long maturities and deep-out-of-the-money options ([[@grauerOptionTradeClassification2022]])
+- “The moneyness ratio for call options is calculated as the underlying divided by the strike price. The moneyness ratio for puts is the strike price divided by the underlying price. The absolute value of the relative change in the underlying price is computed as $\left|\frac{P_c - P_p}{P_p}\right|$, where $P_c$ is the underlying price at the time of the current transaction (i.e., the transaction being classified), and $P_p$ is the underlying price at the time of the latest quote” (Savickas and Wilson, 2003, p. 888) Footnote 6: “In results not reported here, we find that daily trading volume has trivial economic impact on misclassification probability for all four rules.”
+option type
+- “Option moneyness has two indirect effects on classification precision. First, deep in-the-money options have deltas close to one (in absolute value) and, therefore, are more sensitive to the underlying asset price changes. Second, there is a positive relation between option moneyness and trade size” (Savickas and Wilson, 2003, p. 889)
+- “For example, out-of-the-money options provide the highest leverage, which attracts informed investors.” (Muravyev, 2016, p. 676)
 
-In an idealised setting where the noise traders/trade initiators place a market order that executes against a limit order or against posted quotes, we can accurately identify the side that initiated the trade based on whether it was executed at the ask price or the bid price. Although such a setting is not a complete depiction of real world stock markets, the literature typically uses such an algorithm to identify the side that initiates the trade because trade initiators are more likely to place market orders since they are less likely to accept the risk of not consummating their trades. In contrast, market makers are more likely to place limit orders because their primary objective is not to take a position in the stock but to clear the market and to profit from the spread. It is possible that some noise traders/trade initiators may place limit orders rather than market orders with the hope of executing trades at a more favourable price. In such instances, trades would cross between two trade initiators rather than between an active trade initiator and a market maker. Such trades add noise to the measure of order flow but as long as some of the trades are truly between active trade initiators and market makers, the net order flow is still an unbiased measure of net trades initiated for a particular stock. The key point to note here is that market makers are involved in at least some of the trades as liquidity providers, and the hypotheses that we examine do not apply to them. https://abfer.org/media/abfer-events-2013/annual-conference/investment-finance/track3-buyers-versus-sellers-who-initiates-trades-and-when.pdf
+We reason that different levels of moneyness offer different 
 
-“An interesting upshot of these results is that the aggressor side of trading appears little related to any underlying information, a decoupling that we argue arises from how trading transpires in modern high frequency markets. Our findings complement recent work by Collin-Dufresne and Vos (2015) who find that standard measures of adverse selection relying on estimates of the persistent price effect” ([[@easleyDiscerningInformationTrade2016]] p. 270)
+- “Out-of-the-money options offer the highest leverage (exposure for a dollar invested) and thus are particularly attractive for informed investors. Consistent with this argument, the information price impact is decreasing and convex in absolute delta. Figure 3(D) shows that the impact decreases from 0.4% for out-of-the-money options to 0.15% for in-the-money options. Next, private information is often short-lived and is related to near-term events, and thus short-term options are better suited for informed investors in addition to providing higher leverage. Indeed, the price impact decreases by 0.12% if time-to-expiration decreases from 80 days to 20 days. Buyer-initiated trades have a higher price impact than sell trades, because these trades provide an opportunity to bet not only on future volatility but also on the underlying direction. These results are broadly consistent with Pan and Poteshman (2006), except that I do not find a significant difference between call and put options, perhaps because my sample consists of large stocks that are easy to sell short.” (Muravyev, 2016, p. 695)
 
 
+![[calculate-stats.png]]
+
+
+![[proximity-to-quote.png]]
+
 **Other research:**
 -   While the determination of average trade sizes allows further gauging of common and larger trade sizes, the initial order size must also be considered. In electronic markets, and particularly in options, price guidance is given by quotes. Trade initiators rarely trade at these quoted prices (5-20% of options screen volume), and mostly enter limit orders mid-market. The difference between order and trade sizes is twofold. Firstly, a trade initiator may slice an order into suborders to minimise market impact by trading 500 futures or options via 5 orders at 100 contracts. Trade records provide evidence for this practise as the trade executions feature the same counterparty on the trade at identical prices with subsequent, virtually identical timestamps. Secondly, other market participants respond to incoming orders from trade initiators in a competitive fashion with immediate-or-cancel (IOC) orders. The initial order of 500 contracts placed a tick under the best quoted offer may be traded by, for example, 3 market makers sending IOCs in 100, 100 and 300 contracts. (https://www.esma.europa.eu/sites/default/files/esma_mifid_add_fese_replyform_1.docx)
 - In an idealised setting where the noise traders/trade initiators place a market order that executes against a limit order or against posted quotes, we can accurately identify the side that initiated the trade based on whether it was executed at the ask price or the bid price. Although such a setting is not a complete depiction of real world stock markets, the literature typically uses such an algorithm to identify the side that initiates the trade because trade initiators are more likely to place market orders since they are less likely to accept the risk of not consummating their trades. In contrast, market makers are more likely to place limit orders because their primary objective is not to take a position in the stock but to clear the market and to profit from the spread. It is possible that some noise traders/trade initiators may place limit orders rather than market orders with the hope of executing trades at a more favourable price. In such instances, trades would cross between two trade initiators rather than between an active trade initiator and a market maker. Such trades add noise to the measure of order flow but as long as some of the trades are truly between active trade initiators and market makers, the net order flow is still an unbiased measure of net trades initiated for a particular stock. The key point to note here is that market makers are involved in at least some of the trades as liquidity providers, and the hypotheses that we examine do not apply to them. https://abfer.org/media/abfer-events-2013/annual-conference/investment-finance/track3-buyers-versus-sellers-who-initiates-trades-and-when.pdf
@@ -43,4 +88,6 @@ g designated market makers). To investigate these three hypotheses further, we e
 
 % TODO: These proxies have in common that they factor in the order book imbalance the relative depth quoted at the best bid and ask prices. If traders care about transaction costs, the relatively wide ask-side spread deters buyers, whereas the tight bid-side spread may attract sellers. There are then more traders submitting market orders at the bid side, and the true effective spread is, on average, smaller than the average midpoint effective spread.
 
-% TODO: Derive in greater detail why orderbook imbalance makes sense! See my notes from Hagströmer
\ No newline at end of file
+% TODO: Derive in greater detail why orderbook imbalance makes sense! See my notes from Hagströmer
+
+
diff --git "a/references/obsidian/\360\237\223\245Inbox/@anandStealthTradingOptions2007.md" "b/references/obsidian/\360\237\223\245Inbox/@anandStealthTradingOptions2007.md"
new file mode 100644
index 00000000..ed2769eb
--- /dev/null
+++ "b/references/obsidian/\360\237\223\245Inbox/@anandStealthTradingOptions2007.md"
@@ -0,0 +1,13 @@
+*title:* Stealth Trading in Options Markets
+*authors:* Amber Anand, Sugato Chakravarty
+*year:* 2007
+*tags:* 
+*status:* #📥
+*related:*
+*code:*
+*review:*
+
+## Notes 📍
+
+## Annotations 📖
+Note: 
\ No newline at end of file
diff --git "a/references/obsidian/\360\237\223\245Inbox/@chenDemandCrashInsurance2019.md" "b/references/obsidian/\360\237\223\245Inbox/@chenDemandCrashInsurance2019.md"
new file mode 100644
index 00000000..8cf60df1
--- /dev/null
+++ "b/references/obsidian/\360\237\223\245Inbox/@chenDemandCrashInsurance2019.md"
@@ -0,0 +1,13 @@
+*title:* Demand for Crash Insurance, Intermediary Constraints, and Risk Premia in Financial Markets
+*authors:* Hui Chen, Scott Joslin, Sophie Xiaoyan Ni
+*year:* 2019
+*tags:* 
+*status:* #📥
+*related:*
+*code:*
+*review:*
+
+## Notes 📍
+
+## Annotations 📖
+Note: 
\ No newline at end of file
diff --git "a/references/obsidian/\360\237\223\245Inbox/@muravyevTherePriceDiscovery2013.md" "b/references/obsidian/\360\237\223\245Inbox/@muravyevTherePriceDiscovery2013.md"
new file mode 100644
index 00000000..1da1929f
--- /dev/null
+++ "b/references/obsidian/\360\237\223\245Inbox/@muravyevTherePriceDiscovery2013.md"
@@ -0,0 +1,13 @@
+*title:* Is there price discovery in equity options?
+*authors:* Dmitriy Muravyev, Neil D. Pearson, John Paul Broussard
+*year:* 2013
+*tags:* 
+*status:* #📥
+*related:*
+*code:*
+*review:*
+
+## Notes 📍
+
+## Annotations 📖
+Note: 
\ No newline at end of file
diff --git a/reports/Content/Appendix.tex b/reports/Content/Appendix.tex
index 65e02436..3fc89c4d 100644
--- a/reports/Content/Appendix.tex
+++ b/reports/Content/Appendix.tex
@@ -8,7 +8,7 @@ \section{Appendix}
     \label{app:literature-ml-tc}
     \begin{table}[ht]
         \centering
-        \caption*[Literature on Trade Classification Using Machine Learning.]{Literature on trade classification using machine learning. Improvement is the out-of-sample performance over the best baselines if multiple baselines are reported. Data requirements may not be identical to these of classical rules.}
+        \caption[Literature on Trade Classification Using Machine Learning.]{Literature on trade classification using machine learning. Improvement is the out-of-sample performance over the best baselines if multiple baselines are reported. Data requirements may not be identical to those of classical rules.}
         \label{tab:literature-trade-classification-ml}
         \begin{tabular}{@{}p{3cm}p{3cm}lp{4cm}p{4cm}l@{}}
             \toprule
@@ -22,13 +22,16 @@ \section{Appendix}
     \end{table}
 \end{landscape}
 
+\subsection{Summary Statistics}
+\label{app:summary-statistics}
+
 
 \subsection{Power-Transforms of Features}
 \label{app:power-transforms-of-features}
 
 \begin{table}[ht]
     \centering
-    \caption*[Power-Transforms of Features]{Power-transforms of features. \lambda~specifies the exponent. Transformations estimated on \gls{ISE} training set.}
+    \caption[Power-Transforms of Features]{Power-transforms of features. $\lambda$~specifies the exponent. Transformations estimated on \gls{ISE} training set.}
     \label{tab:power-transformerations}
     \begin{tabular}{lS}
         \toprule
@@ -95,7 +98,7 @@ \subsection{Features and Transformations}
             \item[*] Notation assumes, that the previous or next trade price is distinguishable.
         \end{tablenotes}
     \end{threeparttable}
-    \caption*[Overview of Features and Transformations]{Overview of Features and Transformations}
+    \caption[Overview of Features and Transformations]{Overview of Features and Transformations}
     \label{tab:features-transformations}
 \end{table}
 
@@ -106,7 +109,7 @@ \subsection{Autocorrelation of Features}
 \begin{figure}[ht]
     \centering
     \includegraphics{auto-corr-features.pdf}
-    \caption*[Autocorrelation of Features]{Autocorrelation Features. Own work.}
+    \caption[Autocorrelation of Features]{Autocorrelation of features. Own work.}
     \label{fig:auto-correlation-features}
 \end{figure}
 
@@ -116,7 +119,7 @@ \subsection{Results of Supervised Models With Re-training}
 
 \begin{table}[ht]
     \centering
-    \caption*[Accuracies of Supervised Approaches With Re-Training On \glsentryshort{CBOE} and \glsentryshort{ISE} Sample]{This table reports the accuracy of \glspl{GBRT} for different feature sets on the \gls{ISE} and \gls{CBOE} test set after re-training on \gls{ISE} training and validation set. The improvement is estimated as the absolute change in accuracy between the classifier and the benchmark. For feature set classical, $\operatorname{gsu}_{\mathrm{small}}$ is the benchmark and otherwise $\operatorname{gsu}_{\mathrm{large}}$.}
+    \caption[Accuracies of Supervised Approaches With Re-Training On \glsentryshort{CBOE} and \glsentryshort{ISE} Sample]{This table reports the accuracy of \glspl{GBRT} for different feature sets on the \gls{ISE} and \gls{CBOE} test set after re-training on \gls{ISE} training and validation set. The improvement is estimated as the absolute change in accuracy between the classifier and the benchmark. For feature set classical, $\operatorname{gsu}_{\mathrm{small}}$ is the benchmark and otherwise $\operatorname{gsu}_{\mathrm{large}}$.}
     \label{tab:results-supervised-retraining-ise-cboe}
     \begin{tabular}{@{}llSSSSSS@{}}
         \toprule
@@ -165,6 +168,6 @@ \subsection{Attention Heads of Transformer}
     % \subfloat[Head (2,8)]{\label{sfig:hb}\includegraphics[width=.23\textwidth]{attention_head_8_layer_2_color_green_ise_quotes_mid.pdf}}\hfill
     % \subfloat[Head (3,8)]{\label{sfig:hc}\includegraphics[width=.23\textwidth]{attention_head_8_layer_3_color_green_ise_quotes_mid.pdf}}\hfill
     % \subfloat[Head (4,8)]{\label{sfig:hd}\includegraphics[width=.23\textwidth]{attention_head_8_layer_4_color_green_ise_quotes_mid.pdf}}\\
-    \caption*[Rule-Like Roles of All Attention Heads]{Attention heads that correspond to trade classification rules. Tuple denotes the location of the attention head in the model in the form of (layer, head). Plot visualises the attention weights for a trade executed at the quote, correctly classified by the model.}
+    \caption[Rule-Like Roles of All Attention Heads]{Attention heads that correspond to trade classification rules. Tuple denotes the location of the attention head in the model in the form of (layer, head). Plot visualises the attention weights for a trade executed at the quote, correctly classified by the model.}
     \label{fig:attention-heads-ise-all-transformer}
     \end{figure}
\ No newline at end of file
diff --git a/reports/Content/bibliography.bib b/reports/Content/bibliography.bib
index b379a325..5acdd102 100644
--- a/reports/Content/bibliography.bib
+++ b/reports/Content/bibliography.bib
@@ -130,6 +130,20 @@ @misc{aminiSelfTrainingSurvey2023
   archiveprefix = {arxiv}
 }
 
+@article{anandStealthTradingOptions2007,
+  title = {Stealth {{Trading}} in {{Options Markets}}},
+  author = {Anand, Amber and Chakravarty, Sugato},
+  year = {2007},
+  journal = {The Journal of Financial and Quantitative Analysis},
+  volume = {42},
+  number = {1},
+  eprint = {27647290},
+  eprinttype = {jstor},
+  publisher = {{Cambridge University Press}},
+  doi = {10.1017/S0022109000002234},
+  urldate = {2023-06-26}
+}
+
 @article{antoniouLognormalDistributionStock2004,
   title = {On the Log-Normal Distribution of Stock Market Data},
   author = {Antoniou, I and Ivanov, Vi.V and Ivanov, Va.V and Zrelov, P.V},
@@ -810,6 +824,17 @@ @misc{chenDeepLearningAsset2021
   archiveprefix = {arxiv}
 }
 
+@article{chenDemandCrashInsurance2019,
+  title = {Demand for {{Crash Insurance}}, {{Intermediary Constraints}}, and {{Risk Premia}} in {{Financial Markets}}},
+  author = {Chen, Hui and Joslin, Scott and Ni, Sophie Xiaoyan},
+  year = {2019},
+  journal = {The Review of Financial Studies},
+  volume = {32},
+  number = {1},
+  doi = {10.1093/rfs/hhy004},
+  urldate = {2023-06-26}
+}
+
 @misc{chenExcelFormerNeuralNetwork2023,
   title = {{{ExcelFormer}}: A Neural Network Surpassing Gbdts on Tabular Data},
   author = {Chen, Jintai and Yan, Jiahuan and Chen, Danny Ziyi and Wu, Jian},
@@ -880,11 +905,8 @@ @article{choiEstimationBidAskSpreads1988
   journal = {The Journal of Financial and Quantitative Analysis},
   volume = {23},
   number = {2},
-  eprint = {2330882},
-  eprinttype = {jstor},
   publisher = {{Cambridge University Press}},
-  doi = {10.2307/2330882},
-  urldate = {2023-06-02}
+  doi = {10.2307/2330882}
 }
 
 @misc{cholakovGatedTabTransformerEnhancedDeep2022,
@@ -962,6 +984,17 @@ @misc{coenenVisualizingMeasuringGeometry2019
   archiveprefix = {arxiv}
 }
 
+@article{collin-dufresneInformedTradingStock2021,
+  title = {Informed {{Trading}} in the {{Stock Market}} and {{Option-Price Discovery}}},
+  author = {{Collin-Dufresne}, Pierre and Fos, Vyacheslav and Muravyev, Dmitriy},
+  year = {2021},
+  journal = {Journal of Financial and Quantitative Analysis},
+  volume = {56},
+  number = {6},
+  doi = {10.1017/S0022109020000629},
+  urldate = {2023-06-26}
+}
+
 @article{congDEEPSEQUENCEMODELING,
   title = {Deep Sequence Modeling: Development and Applications in Asset Pricing},
   author = {Cong, Lin William and Tang, Ke and Wang, Jingyuan and Zhang, Yang}
@@ -3048,6 +3081,17 @@ @article{muravyevOrderFlowExpected2016
   doi = {10.1111/jofi.12380}
 }
 
+@article{muravyevTherePriceDiscovery2013,
+  title = {Is There Price Discovery in Equity Options?},
+  author = {Muravyev, Dmitriy and Pearson, Neil D. and Broussard, John Paul},
+  year = {2013},
+  journal = {Journal of Financial Economics},
+  volume = {107},
+  number = {2},
+  doi = {10.1016/j.jfineco.2012.09.003},
+  urldate = {2023-06-26}
+}
+
 @article{nabiNovelApproachStock2020,
   title = {A Novel Approach for Stock Price Prediction Using Gradient Boosting Machine with Feature Engineering ({{GBM-wFE}})},
   author = {Nabi, Rebwar M. and Ab. M. Saeed, Soran and Harron, Habibollah},
@@ -3755,6 +3799,24 @@ @article{schapireStrengthWeakLearnability1990
   urldate = {2022-12-14}
 }
 
+@inproceedings{schroffFaceNetUnifiedEmbedding2015,
+  title = {{{FaceNet}}: {{A}} Unified Embedding for Face Recognition and Clustering},
+  booktitle = {2015 {{IEEE Conference}} on {{Computer Vision}} and {{Pattern Recognition}} ({{CVPR}})},
+  author = {Schroff, Florian and Kalenichenko, Dmitry and Philbin, James},
+  year = {2015},
+  publisher = {{IEEE}},
+  address = {{Boston, MA, USA}},
+  doi = {10.1109/CVPR.2015.7298682},
+  urldate = {2023-06-26}
+}
+
+@techreport{securitiesandexchangecommissionReportConcerningExaminations2007,
+  title = {Report {{Concerning Examinations}} of {{Options Order Routing}} and {{Execution}}},
+  author = {{Securities and Exchange Commission}},
+  year = {2007},
+  urldate = {2023-06-26}
+}
+
 @article{shahriariTakingHumanOut2016,
   title = {Taking the Human out of the Loop: A Review of Bayesian Optimization},
   author = {Shahriari, Bobak and Swersky, Kevin and Wang, Ziyu and Adams, Ryan P. and {de Freitas}, Nando},
@@ -4380,6 +4442,17 @@ @article{wangTransTabLearningTransferable
   author = {Wang, Zifeng and Sun, Jimeng}
 }
 
+@misc{wangWizMapScalableInteractive2023,
+  title = {{{WizMap}}: {{Scalable Interactive Visualization}} for {{Exploring Large Machine Learning Embeddings}}},
+  author = {Wang, Zijie J. and Hohman, Fred and Chau, Duen Horng},
+  year = {2023},
+  number = {arXiv:2306.09328},
+  eprint = {2306.09328},
+  publisher = {{arXiv}},
+  urldate = {2023-06-26},
+  archiveprefix = {arxiv}
+}
+
 @article{waszczukAssemblingInternationalEquity2014,
   title = {Assembling International Equity Datasets – Review of Studies on the Cross-Section of Returns},
   author = {Waszczuk, Antonina},
diff --git a/reports/Content/data-preprocessing.tex b/reports/Content/data-preprocessing.tex
index 06ca6a4a..32f34d24 100644
--- a/reports/Content/data-preprocessing.tex
+++ b/reports/Content/data-preprocessing.tex
@@ -5,21 +5,29 @@ \section{Empirical Study}\label{sec:empirical-study}
 
 \subsection{Data and Data Preparation}\label{sec:data-and-data-preparation}
 
+% \todo{This paper employs one of the largest data sets in recent intraday options literature.}
 The following chapter describes the construction of datasets, that suffice the data requirements of classical trade classification rules and for our machine learning models. We also discuss how we define and infer the true trade initiator.
 
 \subsubsection{Data Collection}\label{sec:data-collection}
 
-% \textbf{Data Sources}
+\textbf{Data Sources}
 
 Testing the empirical accuracy of our approaches requires option trades where the true initiator is known. To arrive at a labelled sample, we combine data from four individual data sources. Our primary source is LiveVol, which records option trades executed at US option exchanges at a transaction level. We limit our focus to option trades executed at the \gls{CBOE} and \gls{ISE}. LiveVol contains both trade and matching quote data. Like most proprietary data sources, it does not distinguish the initiator nor does it include the involved trader types. For the \gls{CBOE} and \gls{ISE} exchange, the \gls{ISE} Open/Close Trade Profile and \gls{CBOE} Open-Close Volume Summary contain the buy and sell volumes for the option series by trader type aggregated on a daily level. A combination of the LiveVol dataset with the open/close data, allows us to infer the trade initiator for a subset of trades. For evaluation and use in some of our machine learning models, we acquire additional underlying and option characteristics from IvyDB's OptionMetrics.
 
-% \textbf{Trade Initiator}
+\textbf{Trade Initiator}
 
-In \cref{sec:trade-initiator} we discussed three views on the trade initiator. As our data sources do not provide the order entry times or order types for both sides of the trade, we define the trade initiator based on the position relative to the market maker, who caters to the liquidity demand. More specifically, we classify customer trades as buyer-initiated if the trade is due to a customer buy order and as seller-initiated for customer sales. As previous literature, e.g., \textcite[][4276]{garleanuDemandBasedOptionPricing2009} suggests that trader types, for example, proprietary traders, have a similar role to market makers by supplying liquidity, we limit our analysis to trades between customers and market makers for which the picture is unambiguous. Our definition is consistent with the off \textcite[][8]{grauerOptionTradeClassification2022}.
+In \cref{sec:trade-initiator} we discussed three views on the trade initiator. As our data sources do not provide the order entry times or order types for both sides of the trade, we define the trade initiator based on the position relative to the market maker, who caters to the liquidity demand. More specifically, we classify customer trades as buyer-initiated if the trade is due to a customer buy order and as seller-initiated for customer sales. As previous literature, e.g., \textcite[][4276]{garleanuDemandBasedOptionPricing2009} suggests that trader types, for example, proprietary traders, have a similar role to market makers by supplying liquidity, we limit our analysis to trades between customers and market makers for which the picture is unambiguous. Our definition is consistent with that of \textcite[][8]{grauerOptionTradeClassification2022}.
 
-% \textbf{Sample Construction}
+\textbf{Sample Construction}
 
-Our sample construction follows \textcite[][7--9]{grauerOptionTradeClassification2022}, fostering comparability between both works. We acquire transaction-level options trade data for all major US exchanges from LiveVol. The dataset is tabular, and each record is time-stamped to the second. For each transaction, the executing exchange, trade price, trade volume, quotes and quote sizes for the exchanges where the option is quoted, as well as the \gls{NBBO} are recorded. This is sufficient to estimate the quote rule, depth rule, and trade size rule. In addition, for tick-based algorithms, we add the previous and subsequent distinguishable trade prices. We can uniquely identify the traded option series from a distinct key consisting of the underlying, expiration date, option type and strike price. Our analysis is conducted on transactions at the \gls{ISE} and \gls{CBOE}. To purge the data of potential errors, we filter out option trades with a trade price equal to or less than zero and eliminate trades with a negative or zero trade volume as well as large trades with a trading volume exceeding \num{10000000} contracts. We further remove cancelled or duplicated trades and eliminate entries with multiple underlying symbols for the same root.
+% \todo{Optional: Minor importance of cust-cust trades? “Second, market-makers stand on the liquidity-providing side of most trades. In the options market, market-makers transfer liquidity not only over time but also across different options. With several hundred option contracts available for each underlying, two investors rarely select the same option and thus they are likely to trade with a market-maker. Also, exchange rules grant lead market-makers substantial competitive edge over other liquidity providers (e.g., the 60/40 NBBO order split rule). These rules further strengthen lead market-makers position as the main liquidity providers and make it hard for new players to enter” Found in \textcite{muravyevOrderFlowExpected2016}.}
+Our sample construction follows \textcite[][7--9]{grauerOptionTradeClassification2022}, fostering comparability between both works. We acquire transaction-level options trade data for all major US exchanges from LiveVol. The dataset is tabular, and each record is time-stamped to the second. For each transaction, the executing exchange, trade price, trade volume, quotes and quote sizes for the exchanges where the option is quoted, as well as the \gls{NBBO} are recorded. This is sufficient to estimate the quote rule, depth rule, and trade size rule. In addition, for tick-based algorithms, we add the previous and subsequent distinguishable trade prices. We can uniquely identify the traded option series from a distinct key consisting of the underlying, expiration date, option type and strike price. Our analysis is conducted on transactions at the \gls{ISE} and \gls{CBOE}. To purge the data of potential errors, we filter out:
+\begin{enumerate}[label=(\roman*),noitemsep]
+    \item trades with a trade price $\leq \SI{0}[\$]{}$,
+    \item trades with a trade volume $\leq 0$ or $\ge \num{10000000}$ contracts,
+    \item cancelled or duplicated trades,
+    \item entries with multiple underlying symbols for the same root.
+\end{enumerate}
 
 The open/close datasets for the \gls{ISE} and \gls{CBOE} contain the daily buy and sell volumes for the option series by trader type, the trade volume and whether a position was closed or opened. Four trader types are available: customer, professional customer, broker/dealer, and firm proprietary. Customer orders are placed by a retail trader or a member of the exchange on behalf of the customer. Professional customers are distinguished from the former by a high trading activity ($\geq390$ orders per day over one month period). Likewise, trades by a member are classified as proprietary, if executed for their account or broker/dealer if placed for non-members of the exchange \autocite[][2]{nasdaqincFrequentlyAskedQuestions2017}. Trades of customers and professional customers are detailed by trade volume ($\leq 100$; 101--199; $> 199$ contracts). As well as, if a position is newly opened or closed. We first sum buy and sell orders of all trader types and volumes to obtain the daily trading volumes at the \gls{ISE} or \gls{CBOE} per option series and day. Separately for the customer buy and sell volumes, we calculate the daily aggregates identified by the account type customer.
 
@@ -29,9 +37,7 @@ \subsubsection{Data Collection}\label{sec:data-collection}
 
 Following our initial rationale for using semi-supervised methods, we reserve unlabelled trades between 24 October 2012 and 24 October 2013 at the \gls{ISE} for pre- and self-training. We provide further details in \cref{sec:train-test-split}. Since LiveVol doesn't distinguish by trader types, this dataset includes both customer and non-customer trades, as well as simultaneous buy and sell trades on the same day.
 
-While our procedure makes the inference of the true trade initiator partly feasible, concerns regarding a selection bias due to the excessive filtering have to be raised. We address these concerns as part of our exploratory data analysis in (...), in which we compare unmerged and merged sub-samples.
-
-\todo{Report stats for unmatched and matched example in appendix or here. Adress differences between matched and unmatched sample.}
+While our procedure makes the inference of the true trade initiator partly feasible, concerns regarding a selection bias due to the excessive filtering have to be raised. We address these concerns and report summary statistics for unmerged and merged sub-samples in \cref{app:summary-statistics}.
 
 In the following chapter, we motivate feature engineering, present our feature sets and discuss strategies for transforming features into a form that accelerates and advances the training of our models.
 
@@ -46,19 +52,19 @@ \subsubsection{Data Preprocessing}\label{sec:data-preprocessing}
 \begin{ThreePartTable}
     \centering
     \begin{TableNotes}\footnotesize
-        \item[*] Notation assumes, that the previous or next trade price is distinguishable.
+        \item[*] Notation assumes that the previous or next trade price is distinguishable. See discussion in \cref{sec:tick-test}.
     \end{TableNotes}
     \begin{longtable}{@{}lllllll@{}}
 
 
-        \caption[Features and Feature Sets]{Features and Feature Sets.}\label{tab:feature-sets}                                                                                                  \\
+        \caption[Features and Feature Sets]{Features and feature sets. We divide the data into three feature sets (classic, size, and option), aligned with the data requirements of traditional trade classification rules. Feature definitions are derived from the rules without quantization; the column source documents their origin. Additional option-specific features are defined in the text.}\label{tab:feature-sets} \\
         \toprule
-        Feature Name            & Definition                                                                                       & Source               & \gls{FS} 1 & \gls{FS} 2 & \gls{FS} 3 \\ \midrule
+        Feature Name            & Definition                                                                                                                      & Source               & \gls{FS} Classic                  & \gls{FS} Size                     & \gls{FS} Option                                                                                                                                    \\ \midrule
         \endfirsthead
 
-        \multicolumn{6}{l}{\textit{Continued \tablename~\thetable}}                                                                                                                              \\
+        \multicolumn{6}{l}{\textit{Continued \tablename~\thetable}}                                                                                                                                                                                                                                                                                                                                                   \\
         \toprule
-        Feature Name            & Definition                                                                                       & Source               & \gls{FS} 1 & \gls{FS} 2 & \gls{FS} 3 \\ \midrule
+        Feature Name            & Definition                                                                                                                      & Source               & \gls{FS} Classic                  & \gls{FS} Size                     & \gls{FS} Option                                                                                                                                    \\ \midrule
         \endhead
 
         \bottomrule
@@ -67,55 +73,54 @@ \subsubsection{Data Preprocessing}\label{sec:data-preprocessing}
         \insertTableNotes
         \endlastfoot
 
-        trade price             & $P_{i, t}$                                                                                       & tick rule            & \checkmark & \checkmark & \checkmark \\
-        price lag (ex)          & $P_{i, t-1}^{\text{ex}}$\tnote{*}                                                                & tick rule            & \checkmark & \checkmark & \checkmark \\
-        price lag (all)         & $P_{i, t-1}^{\text{all}}$\tnote{*}                                                               & tick rule            & \checkmark & \checkmark & \checkmark \\
-        price change lag (ex)   & $P_{i, t-1}^{\text{ex}}/P_{i, t}^{\text{ex}}$\tnote{*}                                           & tick rule            & \checkmark & \checkmark & \checkmark \\
-        price change lag (all)  & $P_{i, t-1}^{\text{all}}/P_{i, t}^{\text{all}}$\tnote{*}                                         & tick rule            & \checkmark & \checkmark & \checkmark \\
-        priced lead (ex)        & $P_{i, t+1}^{\text{ex}}$\tnote{*}                                                                & rev. tick rule       & \checkmark & \checkmark & \checkmark \\
-        price lead (all)        & $P_{i, t+1}^{\text{all}}$\tnote{*}                                                               & rev. tick rule       & \checkmark & \checkmark & \checkmark \\
-        price change lead (ex)  & $P_{i, t}^{\text{ex}}/P_{i, t+1}^{\text{ex}}$\tnote{*}                                           & rev. tick rule       & \checkmark & \checkmark & \checkmark \\
-        price change lead (all) & $P_{i, t}^{\text{all}}/P_{i, t+1}^{\text{all}}$\tnote{*}                                         & rev. tick rule       & \checkmark & \checkmark & \checkmark \\
-        bid (all)               & $B_{i, t}^{\text{all}}$                                                                          & quote rule           & \checkmark & \checkmark & \checkmark \\
-        bid (ex)                & $B_{i, t}^{\text{ex}}$                                                                           & quote rule           & \checkmark & \checkmark & \checkmark \\
-        ask (all)               & $A_{i, t}^{\text{all}}$                                                                          & quote rule           & \checkmark & \checkmark & \checkmark \\
-        ask (ex)                & $A_{i, t}^{\text{all}}$                                                                          & quote rule           & \checkmark & \checkmark & \checkmark \\
-        prox. to quotes (ex)    & $\left(P_{i, t}^{\text{ex}}- M_{i, t}^{\text{ex}}\right) / \tfrac{1}{2} S_{i, t}^{\text{ex}}$    & \gls{EMO}/\gls{CLNV} & \checkmark & \checkmark & \checkmark \\
-        prox. to quotes (all)   & $\left(P_{i, t}^{\text{all}}- M_{i, t}^{\text{all}}\right) / \tfrac{1}{2} S_{i, t}^{\text{all}}$ & \gls{EMO}/\gls{CLNV} & \checkmark & \checkmark & \checkmark \\
-        bid ask size ratio (ex) & $\tilde{B}_{i, t}^{\text{ex}}/\tilde{A}_{i, t}^{\text{ex}}$                                      & depth rule           &            & \checkmark & \checkmark \\
-        bid size (ex)           & $\tilde{B}_{i, t}^{\text{ex}}$                                                                   & depth rule           &            & \checkmark & \checkmark \\
-        ask size (ex)           & $\tilde{A}_{i, t}^{\text{ex}}$                                                                   & depth rule           &            & \checkmark & \checkmark \\
-        rel. bid size (ex)      & $\tilde{B}_{i, t}^{\text{ex}}/\tilde{P}_{i, t}^{\text{ex}}$                                      & trade size rule      &            & \checkmark & \checkmark \\
-        rel. ask size (ex)      & $\tilde{A}_{i, t}^{\text{ex}}/\tilde{P}_{i, t}^{\text{ex}}$                                      & trade size rule      &            & \checkmark & \checkmark \\
-        trade size              & $\tilde{P}_{i, t}$                                                                               & trade size rule      &            & \checkmark & \checkmark \\
-        strike price            &                                                                                                  & option               &            &            & \checkmark \\
-        volume option series    &                                                                                                  & option               &            &            & \checkmark \\
-        root                    &                                                                                                  & option               &            &            & \checkmark \\
-        time to maturity        &                                                                                                  & option               &            &            & \checkmark \\
-        moneyness               &                                                                                                  & option               &            &            & \checkmark \\
-        option type             &                                                                                                  & option               &            &            & \checkmark \\
-        issue type              &                                                                                                  & option               &            &            & \checkmark \\ \bottomrule
+        trade price             & $P_{i, t}$                                                                                                                      & tick rule            & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        price lag (ex)          & $P_{i, t-1}^{\text{ex}}$\tnote{*}                                                                                               & tick rule            & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        price lag (all)         & $P_{i, t-1}^{\text{all}}$\tnote{*}                                                                                              & tick rule            & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        price change lag (ex)   & $P_{i, t-1}^{\text{ex}}/P_{i, t}^{\text{ex}}$\tnote{*}                                                                          & tick rule            & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        price change lag (all)  & $P_{i, t-1}^{\text{all}}/P_{i, t}^{\text{all}}$\tnote{*}                                                                        & tick rule            & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        price lead (ex)         & $P_{i, t+1}^{\text{ex}}$\tnote{*}                                                                                               & rev. tick rule       & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        price lead (all)        & $P_{i, t+1}^{\text{all}}$\tnote{*}                                                                                              & rev. tick rule       & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        price change lead (ex)  & $P_{i, t}^{\text{ex}}/P_{i, t+1}^{\text{ex}}$\tnote{*}                                                                          & rev. tick rule       & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        price change lead (all) & $P_{i, t}^{\text{all}}/P_{i, t+1}^{\text{all}}$\tnote{*}                                                                        & rev. tick rule       & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        bid (all)               & $B_{i, t}^{\text{all}}$                                                                                                         & quote rule           & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        bid (ex)                & $B_{i, t}^{\text{ex}}$                                                                                                          & quote rule           & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        ask (all)               & $A_{i, t}^{\text{all}}$                                                                                                         & quote rule           & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        ask (ex)                & $A_{i, t}^{\text{ex}}$                                                                                                          & quote rule           & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        prox. to quotes (ex)    & $\tfrac{2 \left(P_{i, t}^{\text{ex}}- M_{i, t}^{\text{ex}}\right)}{\left(A_{i, t}^{\text{ex}}-B_{i, t}^{\text{ex}}\right)}$     & \gls{EMO}/\gls{CLNV} & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        prox. to quotes (all)   & $\tfrac{2 \left(P_{i, t}^{\text{all}}- M_{i, t}^{\text{all}}\right)}{\left(A_{i, t}^{\text{all}}-B_{i, t}^{\text{all}}\right)}$ & \gls{EMO}/\gls{CLNV} & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        bid ask size ratio (ex) & $\tilde{B}_{i, t}^{\text{ex}}/\tilde{A}_{i, t}^{\text{ex}}$                                                                     & depth rule           &                                   & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        bid size (ex)           & $\tilde{B}_{i, t}^{\text{ex}}$                                                                                                  & depth rule           &                                   & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        ask size (ex)           & $\tilde{A}_{i, t}^{\text{ex}}$                                                                                                  & depth rule           &                                   & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        rel. bid size (ex)      & $\tilde{B}_{i, t}^{\text{ex}}/\tilde{P}_{i, t}^{\text{ex}}$                                                                     & trade size rule      &                                   & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        rel. ask size (ex)      & $\tilde{A}_{i, t}^{\text{ex}}/\tilde{P}_{i, t}^{\text{ex}}$                                                                     & trade size rule      &                                   & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        trade size              & $\tilde{P}_{i, t}$                                                                                                              & trade size rule      &                                   & \textcolor{viz-green}{\checkmark} & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        root                    & $\left\{\mathtt{SPY},\ldots\right\}$                                                                                            & option               &                                   &                                   & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        strike price            & $K_{i,t}$                                                                                                                       & option               &                                   &                                   & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        time to maturity        & $\tau_{i,t}$                                                                                                                    & option               &                                   &                                   & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        moneyness               & $\tfrac{S_{i,t}}{K_{i,t}}$ or $\tfrac{K_{i,t}}{S_{i,t}}$                                                                        & option               &                                   &                                   & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        option type             & $\left\{\mathtt{C},\mathtt{P}\right\}$                                                                                          & option               &                                   &                                   & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        security type           & $\left\{\mathtt{0},\mathtt{A},\ldots\right\}$                                                                                   & option               &                                   &                                   & \textcolor{viz-green}{\checkmark}                                                                                                                  \\
+        volume option series    &                                                                                                                                 & option               &                                   &                                   & \textcolor{viz-green}{\checkmark}                                                                                                                  \\ \bottomrule
     \end{longtable}
 \end{ThreePartTable}
 
-\todo{define and explain option characterstics and add proper caption to table.}
-
-\todo{Panel A uses the full sample, and panel B uses only trades that fall in a particular moneyness category, defined by the absolute value of the option delta: || (Big Delta) < 0.35 for OTM, 0.35 ≤| < 0.65 for ATM, and 0.65 ≤|| for ITM options. The quoted half-spread is half the difference between the ask and bid prices at the time of the trade, expressed as either a percentage of the pretrade bid-ask midpoint or in dollars. The effective half-spread is the difference between the trade price and the bid-ask midpoint (bid-ask midpoint and trade price) for trades signed as buys (sells), where a trade is signed as a buy (sell) if the trade price is greater (less) than the midpoint. The price impact for a buy (sell) is the (negative of the) difference between the bid-ask midpoint 10 minutes after the trade and at the time of the trade, divided by the midpoint at the time of the trade. ( Found in murjaev)}
 
-\todo{(b)Option moneyness measured by absolute option delta (i.e., the sensitivity oftheoption price to changes in the underlying price)is between 0.2 and 0.8, that is, options with at least some “optionality” are selecte}
+% Issue Type = the type of security: 
+% - 0 = Common Stock 
+% - A = Market index 
+% - 7 = Mutual or investment trust fund 
+% - F = ADR/ADS 
+% - % = Exchange-traded fund 
+% - (blank) = Unspecified
 
 Features and feature sets are documented in \cref{tab:feature-sets}.
 We aid the models by estimating the change in trade price between the previous and successive distinguishable trades. This is identical to the criterion used in the (reverse) tick rule, but in a non-quantized fashion to enforce a richer decision boundary and to surpass hard cut-off points. Similarly, the proximity of the trade price to the quotes, which is the decisive criterion in the quote rule and hybrids' thereof is added. The feature value ranges from $\left(-\infty,\infty\right)$ and is $-1$ for trades at the bid, 0 for trades at the mid, and 1 for trades at the ask. Quotes and trade prices are also incorporated as-is.
 
-\todo{add reasoning. depth rule and trade size rule yield significant improvements and must be considered state of the art. Thus, extend feature set. See e.g., Savickas p. 889}
-
-Our second feature set extends the first feature set by the trade size and size of the quotes, required to estimate hybrid rules involving the depth rule and trade size rule. Both rules are state-of-the-art when paired with hybrid algorithms and are thus both benchmark and source of features. We model the depth rule as the ratio between ask and bid sizes and the trade size rule as the ratio between the size of the trade and the quoted bid and ask sizes. Since features are not discretized, we obtain a generic formulation of the trade size rule, where part of the quoted size can remain unfilled. The trade price and midspread required for the depth rule are already encompassed in the first feature set.
+Our second feature set, named \gls{FS} Size, extends the first feature set by the trade size and size of the quotes, required to estimate hybrid rules involving the depth rule and trade size rule. Both rules achieve state-of-the-art performance on option trades when paired with hybrid algorithms and are thus an important source of features. We model the depth rule as the ratio between ask and bid sizes and the trade size rule as the ratio between the size of the trade and the quoted bid and ask sizes. Since features are not discretised, we obtain a generic formulation of the trade size rule, where part of the quoted size can remain unfilled. This potentially helps to distinguish limit orders from market orders. The trade price and midspread required for the depth rule are already encompassed in the first feature set. More generically, trade size is known to strongly affect classification. For instance, \textcites[][889]{savickasInferringDirectionOption2003}[][537]{ellisAccuracyTradeClassification2000} report that better classification is associated with smaller trades, as smaller trades are more likely to be executed at the quotes. By providing the model with the trade and quoted sizes we hope to make these nuances learnable.
 
-\todo{reason dependency on characteristics}
+Our largest feature set, abbreviated as \gls{FS} Option, also incorporates option characteristics, including the strike price, the time to maturity, the moneyness, the option type and security type as well as the underlying and traded volume of the option series. By providing unique identifiers for the option series, we can potentially establish connections between transactions when trade initiators divide a single order into sub-orders or rely on complex trades. Features are also informative individually. Similar reasoning applies to the daily volume of the option series. Time to maturity $\tau_{i,t}$, estimated in days, indirectly affects classification performance. On \gls{CBOE} data in \textcite[][889]{savickasInferringDirectionOption2003}, trades with longer maturities are smaller, hence more likely to be classified correctly. Moreover, time to maturity can be used as a dummy to identify rollovers \autocite[][700]{muravyevOrderFlowExpected2016}. When investors are short in call or put options, they replace expiring with non-expiring options, which creates selling pressure in the non-expiring option. The feature could make the procedure learnable. Related to the time to maturity is moneyness, estimated as the ratio between the price of the underlying $S_{i,t}$ and the strike price $K_{i,t}$ for calls and the reciprocal for puts. As moneyness is linked to leverage in the investment, we reason that incentives to initiate a trade might vary between buyers and sellers. The classification of index options poses major challenges for traditional approaches relative to other security types, as unanimously documented in \textcites[][22]{grauerOptionTradeClassification2022}[][898--899]{savickasInferringDirectionOption2003}. Therefore, we equip the models with the security type, as well as the option type and root, to extend the learnable context.
 
-Our largest feature set also incorporates option characteristics, including the strike price, the time to maturity, the moneyness, the option type and issue type as well as the underlying and traded volume of the option series. By providing the model with option-specific features, we make nuances between the underlying, security types, and option types learnable. Additionally, by providing unique identifiers for the option series, we can potentially learn to establish connections between transactions when trade initiators divide a single order into sub-orders.
-
-Arguably, our models have simultaneous access to the previous and successive trade prices and quotes for both the exchange and the NBBO, which is an advantage over base rules. As we benchmark against various, stacked hybrid rules, the data requirements are comparable. We emphasise this aspect, as it is neglected in previous works \autocites[][485]{blazejewskiLocalNonParametricModel2005}[][48]{ronenMachineLearningTrade2022}[][9]{rosenthalModelingTradeDirection2012}.
+Arguably, our models have simultaneous access to the previous and successive trade prices and quotes for both the exchange and the \gls{NBBO}, which is an advantage over base rules. As we benchmark against various, stacked hybrid rules, the data requirements are comparable. We emphasise this aspect, as it is neglected in previous works \autocites[][485]{blazejewskiLocalNonParametricModel2005}[][48]{ronenMachineLearningTrade2022}[][9]{rosenthalModelingTradeDirection2012}.
 
 \textbf{Numerical Features}
 
@@ -141,7 +146,7 @@ \subsubsection{Data Preprocessing}\label{sec:data-preprocessing}
 
 \textbf{Categorical Features}
 
-As for the categorical variables, consisting of the option type, the underlying, and the issue type, different transformations are required. We perform a label encoding by randomly mapping every unique value onto an integer key. As an example, the option type in the set $\{\mathrm{'C'},\mathrm{'P'}\}$ would be randomly mapped onto $\{1,0\}$. This basic transformation defers the handling of categorical data to the model. Also, it minimises target leakage. Missing classes or classes unseen during training are mapped to the key of an $\mathtt{[UNK]}$ \gls{token}, as motivated in \cref{sec:token-embeddings}.
+As for the categorical variables, consisting of the option type, the underlying, and the issue type, different transformations are required. We perform a label encoding by randomly mapping every unique value onto an integer key. As an example, the option type in the set $\left\{\mathtt{C},\mathtt{P}\right\}$ would be randomly mapped onto $\{1,0\}$. This basic transformation defers the handling of categorical data to the model. Also, it minimises target leakage. Missing classes or classes unseen during training are mapped to the key of an $\mathtt{[UNK]}$ \gls{token}, as motivated in \cref{sec:token-embeddings}.
 
 The option type and issue type are both low-cardinal with two and five unique classes. Differently, the underlying is high-cardinal with more than \num{9999} distinct classes, as options are written on a wide range of underlyings. The high cardinality of the feature not just drives the computational demand through a higher parameter count but also affects the model's tendency to overfit, as most classes appear infrequently. Thus, we require each category to appear at least \num{1000} times in the training set. Infrequent categories are removed by mapping to the $\mathtt{[UNK]}$ \gls{token}. Virtually, this is identical to constraining the vocabulary size $N_V = \num{3333}$. Vocabulary is defined on the \gls{ISE} labelled train set and shared between all sets.
 
@@ -153,9 +158,9 @@ \subsubsection{Train-Test Split}\label{sec:train-test-split}
 
 Prior classical works assess the performance of classical rules in-sample \autocite[cp.][541]{ellisAccuracyTradeClassification2000} or in an out-of-sample setting \autocites[cp.][7--9]{grauerOptionTradeClassification2022}[][3814--3815]{chakrabartyTradeClassificationAlgorithms2007}. In the presence of tunable hyperparameters in our classifiers, we separate the \gls{ISE} dataset into \emph{three} disjoint sets. The training set is used to fit the classifier to the data. The validation set is dedicated to tuning the hyperparameters, and the test set is used for unbiased out-of-sample estimates.
 
-\todo{Better motivate with split orders e.g., limit orders. See \autocite{choiEstimationBidAskSpreads1988}}
-
-Trades in the dataset are ordered by time of execution, and nearby trades exhibit auto-correlation. Exemplary, subsequent trades on the same option series may share a similar trade price and quotes. This imposes constraints on the train-test split, which must ensure that minimal information leaks into the test set through serially-correlated features, leading to an otherwise overestimated model performance.\footnote{We emphasise this aspect, as previous research of \textcite[][14]{ronenMachineLearningTrade2022} is expectedly affected from this issue leading to exaggerated results.} The violation of statistical independence, out rules methods like the $k$-fold cross-validation or random train-test splits, both of which assume samples to be i.i.d. \autocite[][103--105]{lopezdepradoAdvancesFinancialMachine2018}. Differently, our work statically splits into subsets by time, which maintains the temporal ordering and eschews data leakage. Albeit this limits the model's ability to leverage recent information for prediction beyond the training set's cut-off point. We do not explore dynamic training schemes, as they are practically intractable considering the number of model combinations and computational requirements of Transformers and gradient-boosted trees. In the absence of an update mechanism, our results can be interpreted as a lower bound.
+Trades in the dataset are ordered by time of execution, and nearby trades can be auto-correlated, as documented in \cref{app:autocorrelation-of-features}.
+Prime examples of auto-correlation between trades are market or limit orders that are split into smaller orders to encourage order execution. % Also, informed traders disguise their trading activity by slicing orders into smaller-sized orders, as reported by \textcite[][183]{anandStealthTradingOptions2007}. 
+The resulting, separate transactions are trivial to classify with the true label of a single transaction. This imposes constraints on the train-test split, which must ensure that minimal information leaks into the test set through serially-correlated features, leading to an otherwise overestimated model performance.\footnote{We emphasise this aspect, as previous research of \textcite[][14]{ronenMachineLearningTrade2022} is expectedly affected by this issue, leading to exaggerated results.} The violation of statistical independence rules out methods like the $k$-fold cross-validation or random train-test splits, both of which assume samples to be i.i.d.\ \autocite[][103--105]{lopezdepradoAdvancesFinancialMachine2018}. Differently, our work statically splits into subsets by time, which maintains the temporal ordering and eschews data leakage. However, this limits the model's ability to leverage recent information for prediction beyond the training set's cut-off point. We do not explore dynamic training schemes, as they are practically intractable considering the number of model combinations and computational requirements of Transformers and gradient-boosted trees. In the absence of an update mechanism, our results can be interpreted as a lower bound.
 
 Applying the time-based split, we attribute the first \SI{60}{\percent} of our dataset for training and the next \SI{20}{\percent} each for validation and testing. Days at the split boundary are assigned to either one set to avoid train-test contamination. Data within the training and validation set may be shuffled to accelerate training.
 
@@ -170,8 +175,6 @@ \subsubsection{Train-Test Split}\label{sec:train-test-split}
 
 Models are pre-trained on unlabelled samples from the last year of the training period. Given the significantly larger number of unlabelled customer trades, the pre-training period is reduced to one year to facilitate training on the available computing resources. Within the period, we filter out trades for which true label can be inferred, to avoid overlaps with the supervised training set. This is essential for self-training, as labelled and unlabelled data are provided to the model simultaneously.
 
-We use the \gls{CBOE} sample past 5 November 2015 as a second test set, as visualised in \cref{fig:train-test-split}. Our evaluation approach is the most rigorous as it disallows any form of adaptation of the models, thereby ensuring a rigorous evaluation. Unlike transfer learning techniques such as parameter or model transfer, which expectedly improve model performance, we choose to forgo these techniques and demonstrate the effectiveness of our models without any transfer of knowledge. The start date ensures that leakage from the \gls{ISE} set is minimised. \footnote{The datasets contain features, such as the \gls{NBBO}, that are identical for both sets, assuming trades were executed at both exchanges simultaneously. Utilising the full \gls{CBOE} sample could result in exaggerated performance estimates the corresponding \gls{ISE} trade is used in training.}
-
-Our train-test-split assumes that all subsets are drawn from the same distribution, so fitting a classifier on the training set and optimising for the validation set provides good estimates for the test set. To validate this assumption, we use adversarial validation. Specifically, we re-label all training samples with $y=-1$ and all trades of the validation set with $y=1$, train a classifier on a random subset of the composed dataset and predict class conformance. The performance is estimated using the \gls{MCC} of \textcite[][445]{matthewsComparisonPredictedObserved1975}, which ranges between $\left[-1, 1\right]$ and is insensitive to class imbalances.\footnote{Classes are imbalanced, due to the training set being three times the size of the validation set.} Assuming train and validation samples are sampled from the same distribution, the performance estimate is near a random guess, or $\operatorname{MCC} = 0$. For the mid-sized feature set, the \gls{MCC} is \num{0.364260805498287} suggesting training and validation sets are approximately similar. The next section discusses techniques used in training the classifiers.
+We use the \gls{CBOE} sample past 5 November 2015 as a second test set, as visualised in \cref{fig:train-test-split}. Our evaluation approach is the most rigorous, as it disallows any form of adaptation of the models. We deliberately forgo transfer learning techniques such as parameter or model transfer, which expectedly improve model performance, and demonstrate the effectiveness of our models without any transfer of knowledge. The start date ensures that leakage from the \gls{ISE} set is minimised.\footnote{The datasets contain features, such as the \gls{NBBO}, that are identical for both sets, assuming trades were executed at both exchanges simultaneously. Also, quotes can be identical between exchanges, if market makers quote at the \gls{NBBO}, which is common practice as documented in \textcite[10]{securitiesandexchangecommissionReportConcerningExaminations2007}. Utilising the full \gls{CBOE} sample could result in exaggerated performance estimates if the corresponding \gls{ISE} trade is used in training.}
 
-\todo{report stats for entire sample and subsamples in the appendix ()}
\ No newline at end of file
+Our train-test-split assumes that all subsets are drawn from the same distribution, so fitting a classifier on the training set and optimising for the validation set provides good estimates for the test set. To validate this assumption, we use adversarial validation. Specifically, we re-label all training samples with $y=-1$ and all trades of the validation set with $y=1$, train a classifier on a random subset of the composed dataset and predict class conformance. The performance is estimated using the \gls{MCC} of \textcite[][445]{matthewsComparisonPredictedObserved1975}, which ranges in $\left[-1, 1\right]$ and is insensitive to class imbalances.\footnote{Classes are imbalanced, due to the training set being three times the size of the validation set.} Assuming train and validation samples are sampled from the same distribution, the performance estimate is near a random guess, or $\operatorname{MCC} = 0$. For the mid-sized feature set, the \gls{MCC} is \num[round-mode=places, round-precision=3]{0.364260805498287}, suggesting training and validation sets are approximately similar. The next section discusses techniques used in training the classifiers.
\ No newline at end of file
diff --git a/reports/Content/end.tex b/reports/Content/end.tex
index a4c9d6b4..184bf119 100644
--- a/reports/Content/end.tex
+++ b/reports/Content/end.tex
@@ -5,6 +5,8 @@ \section{Discussion}\label{sec:discussion}
 \newpage
 \section{Conclusion}\label{sec:conclusion}
 
+\todo{The predictability results survive an extensive list of robustness checks.}
+
 The goal of this study is to examine the performance of machine learning-based trade classification in the option market. In particular, we propose to model trade classification with Transformers and gradient boosting. Both approaches are supervised and leverage labelled trades. For settings, where labelled trades are scarce, we extend Transformers with a pre-training objective to train on unlabelled trades as well as generate pseudo-labels for gradient boosting through a self-training procedure.
 
 Our models establish a new state-of-the-art for trade classification on the \gls{ISE} and \gls{CBOE} dataset. For \gls{ISE} trades, Transformers achieve an accuracy of \SI{63.78}{\percent} when trained on trade and quoted prices as well as \SI{72.58}{\percent} when trained on additional quoted sizes, improving over current best of \textcite[][27]{grauerOptionTradeClassification2022} by \SI{3.73}{\percent} and \SI{4.97}{\percent}. Similarly, \glspl{GBRT} reach accuracies between \SI{63.67}{\percent} and \SI{73.24}{\percent}. We observe performance improvements up to \SI{6.51}{\percent} for \glspl{GBRT} and \SI{6.31}{\percent} for Transformers when models have access to option characteristics. Relative to the ubiquitous tick test, quote rule, and LR algorithm, improvements are \SI{23.88}{\percent}, \SI{17.11}{\percent}, and \SI{17.02}{\percent}. Outperformance is particularly strong for \gls{OTM} options, options with a long maturity, as well as options traded at the quotes. Both architectures generalise well on \gls{CBOE} data, with even stronger improvements between \SI{4.92}{\percent} and \SI{7.58}{\percent} over the benchmark depending on the model and feature set. 
diff --git a/reports/Content/introduction.tex b/reports/Content/introduction.tex
index 0e498f11..bb251d02 100644
--- a/reports/Content/introduction.tex
+++ b/reports/Content/introduction.tex
@@ -13,7 +13,7 @@ \section{Introduction}\label{sec:introduction}
 Our work fills this gap by focusing on machine learning methods to infer the trade initiator in the option market.\footnote{The authors acknowledge support by the federal state of Baden-Württemberg through \href{https://www.bwhpc.de/}{bwHPC}.} Approaching trade classification with machine learning is a logical choice, given its capability to handle high-dimensional trade data and learn complex decision boundaries. This raises the question, \emph{can an alternative machine learning-based classifier improve upon the accuracy of state-of-the-art approaches for option trade classification?}
 
 To answer this question, we model trade classification through Transformers and gradient boosting. We consider the supervised case, where fully-labelled trade data is available, as well as the semi-supervised setting, where trades are partially labelled with the true trade initiator. Our work makes the following contributions:
-\begin{enumerate}
+\begin{enumerate}[label=(\roman*),noitemsep]
     \item We employ state-of-the-art supervised algorithms i.~e., gradient-boosted trees and Transformer networks to the problem of trade classification and benchmark these approaches against rules-based methods. Our approaches outperform all rule-based approaches on \gls{ISE} and \gls{CBOE} data with comparable data requirements. In the application setting, our approaches approximate the true effective spread best.
     \item In a real-world setting, labelled trades are typically scarce, while unlabelled trades are abundant. Motivated by this consideration, we extend the classifiers to learn on both labelled and unlabelled instances through pre-training and self-training procedures. We analyse the effect on classification accuracy and observe that pre-training of Transformers further alleviates accuracy on \gls{ISE} test data.
     \item We strive to identify the most predictive features. Through a game-theoretic approach, our work is the first to consistently attribute the performance of rule-based and machine learning-based classification to individual features. We discover that both paradigms share common features, but machine learning-based classifiers attain higher performance gains and effectively exploit the data. By probing and visualising the attention mechanism in the Transformer, we can strengthen the connection to rule-based classification and reveal that \emph{learned} rules mimic classical rules.
diff --git a/reports/Content/results.tex b/reports/Content/results.tex
index cf5648aa..767aa2d4 100644
--- a/reports/Content/results.tex
+++ b/reports/Content/results.tex
@@ -257,6 +257,8 @@ \subsection{Results of Semi-supervised
 
 \subsection{Robustness of Results}\label{sec:robustness-checks}
 
+\todo{call them long-term options / expiring options?}
+
 To assess the robustness of our algorithms, we partition the test sets into sub-samples along seven dimensions: option type, security type, trade size, year, time to maturity, moneyness, as well as proximity to quotes. Comparable robustness checks have been previously conducted in \textcite[][47]{grauerOptionTradeClassification2022} as well as  \textcite[][890--892]{savickasInferringDirectionOption2003}, strengthening comparability across different works.\footnote{Despite all efforts, when comparing with \textcite[][47--52]{grauerOptionTradeClassification2022}, one has to be aware that evaluation periods and fallback strategies differ. Furthermore, the authors group similar algorithms. Thus, we recommend relying on our estimates of their rules.}
 
 Our results are tabulated \cref{tab:diff-ise-gbm,tab:diff-cboe-gbm,tab:diff-ise-transformer,tab:diff-cboe-transformer,tab:diff-ise-gbm-semi,tab:diff-cboe-gbm-semi}, separately for \glspl{GBRT} and Transformers as well as exchanges.
@@ -639,7 +641,7 @@ \subsection{Robustness of Results}\label{sec:robustness-checks}
 
 \textbf{Transformer With Self-Training}
 
-Transformers with pre-training objectives outperform the benchmark in all subsets apart from index options and trades outside the quotes. For \gls{ISE} trades in \cref{tab:diff-ise-transformer-semi} pre-training improves performance across subsets, reaching accuracies greater than \SI{86}{\percent}. The only exception is index options, where the performance gap slightly widens. Deep-\gls{OTM}  options and options with long maturity profit the most from the introduction of option features. 
+Transformers with pre-training objectives outperform the benchmark in all subsets apart from index options and trades outside the quotes. For \gls{ISE} trades in \cref{tab:diff-ise-transformer-semi} pre-training improves performance across subsets, reaching accuracies greater than \SI{86}{\percent}. The only exception is index options, where the performance gap slightly widens. Deep-\gls{ITM}  options and options with long maturity profit the most from the introduction of option features. 
 
 For trades at the \gls{CBOE} performance improvements associated with pre-training are slightly lower across several sub-groups. Positively, pre-training improves robustness, as the performance gap to the benchmarks narrows for trades outside the quotes. The results in conjunction with the identical model architecture suggest, that pre-training on unlabelled trades encodes exchange-specific knowledge, which improves performance and robustness on \gls{ISE} trades, but does not universally profit \gls{CBOE} trades. 
 
@@ -833,7 +835,7 @@ \subsection{Feature Importance}\label{sec:feature-importance}
     \label{fig:sage-importances}
 \end{figure}
 
-From \cref{fig:sage-importances} that all models achieve the largest improvement in loss from quoted prices and if provided from the quoted sizes. The contribution of the \gls{NBBO} to performance is roughly equal for all models, suggesting that even simple heuristics effectively exploit the data. For machine learning-based predictors, quotes at the exchange level hold equal importance in classification. This contrast with \gls{GSU} methods, which rely less on exchange-level quotes and mostly classify trades based on upstream rules. The performance improvements from the trade size and quoted size, are slightly lower for rule-based methods compared to machine-learning-based methods.  Transformers and \glspl{GBRT} gain performance from the addition of option features, i.e., moneyness and time-to-maturity. In conjunction with the results from the robustness checks, this suggests that the improvement observed for long-running options or \gls{OTM}  options is directly linked to the moneyness or time to maturity of the traded option itself. However, it remains unclear how these features interact with others. Regardless of the method used, changes in trade price before or after the trade are irrelevant for classification and can even harm performance. Similarly, additional features such as option type, issue type, the trading volume of the option series, and the underlying are also irrelevant. Thus, we note that there is a significant overlap between the importance of features in classical trade classification rules and machine learning-based predictors.
+From \cref{fig:sage-importances}, we observe that all models achieve the largest improvement in loss from quoted prices and, if provided, from the quoted sizes. The contribution of the \gls{NBBO} to performance is roughly equal for all models, suggesting that even simple heuristics effectively exploit the data. For machine learning-based predictors, quotes at the exchange level hold equal importance in classification. This contrasts with \gls{GSU} methods, which rely less on exchange-level quotes and mostly classify trades based on upstream rules. The performance improvements from the trade size and quoted size are slightly lower for rule-based methods compared to machine learning-based methods. Transformers and \glspl{GBRT} gain performance from the addition of option features, i.e., moneyness and time-to-maturity. In conjunction with the results from the robustness checks, this suggests that the improvement observed for long-running options or \gls{ITM} options is directly linked to the moneyness or time to maturity of the traded option itself. However, it remains unclear how these features interact with others. Regardless of the method used, changes in trade price before or after the trade are irrelevant for classification and can even harm performance. Similarly, additional features such as option type, issue type, the trading volume of the option series, and the underlying are also irrelevant. Thus, we note that there is a significant overlap between the importance of features in classical trade classification rules and machine learning-based predictors.
 
 \clearpage
 
@@ -867,7 +869,7 @@ \section{Application in Transaction Cost Estimation}\label{sec:application}
 
 \textbf{Results}
 
-The true and the estimated effective spreads for the test sets are shown in the \cref{tab:effective-spread} aggregated by mean. \textcite[][896--897]{savickasInferringDirectionOption2003} estimated the effective spreads of rules on a older subset of option trades at the \gls{CBOE}, which can be compared against. Our results match theirs in magnitude. \todo{Compare more recent study e.g., \autocite{muravyevOptionsTradingCosts2020} or \autocite{kaeckPriceImpactBid2022}}
+The true and the estimated effective spreads for the test sets are shown in \cref{tab:effective-spread}, aggregated by mean. \textcite[][896--897]{savickasInferringDirectionOption2003} estimated the effective spreads of rules on an older subset of option trades at the \gls{CBOE}, which serves as a point of comparison. Our results match theirs in magnitude.
 
 \begin{table}[!ht]
     \centering
@@ -880,6 +882,8 @@ \section{Application in Transaction Cost Estimation}\label{sec:application}
 
 From our supervised classifiers the FT-Transformer or \gls{GBRT} trained on \gls{FS} option provides estimates closest to the true effective spread, in particular on the \gls{CBOE} sample. For semi-supervised classifiers, Transformer-based models approximate the true effective spread best. This best manifests in a predicted effective spread at the \gls{ISE} of \SI[round-mode=places, round-precision=3]{0.013118}[\$]{} versus \SI[round-mode=places, round-precision=3]{0.004926}[\$]{}. The null hypothesis of equal medians is rejected at the \SI{1}{\percent} level for all classifiers.
 
-\SI[round-precision=3]{0.005}[\$]{}
-
 Thus, $\operatorname{gsu}_{\mathrm{large}}$ provides the best estimate of the effective spread if the true labels are absent. For labelled data, Transformer or gradient boosting-based approaches can provide more accurate estimates. The de facto standard, the \gls{LR} algorithm, fails to deliver accurate estimates and may bias research.
+
+\todo{“In addition, my results offer little help in answering why option bid-ask spreads are so large. This is one of the biggest puzzles in the options literature—existing theories of the option spread fail to explain its magnitude and shape (Muravyev and Pearson (2014)).”}
+
+\todo{compare against \textcite[][4981]{muravyevOptionsTradingCosts2020} or \autocite{kaeckPriceImpactBid2022}}
\ No newline at end of file
diff --git a/reports/Content/rule-approaches.tex b/reports/Content/rule-approaches.tex
index a9820658..011f5194 100644
--- a/reports/Content/rule-approaches.tex
+++ b/reports/Content/rule-approaches.tex
@@ -19,6 +19,8 @@ \subsection{Trade Initiator}
 
 In anticipation of \cref{sec:data-preprocessing}, we adopt a customer's position-based view in relation to the market maker. Nevertheless, it is worth noting that the concepts presented in this thesis can be applied to other perspectives as well.
 
+\todo{new word: “Second, since net buying of puts by customers is tantamount to net put selling by market makers, and such order flow positively predicts market returns, option market makers may have information relevant for predicting market returns.” found in \textcite[][2]{chordiaIndexOptionTrading2021}}
+
 As the trade initiator is frequently absent in option datasets, it must be inferred using trade classification algorithms. The following section introduces basic rules for trade classification. We start with the ubiquitous quote and tick rule and continue with the more recent depth and trade size rule. Our focus is on classification rules, that sign trades on a trade-by-trade basis. Consequently, we omit classification rules for aggregated trades, like the \gls{BVC} algorithm of \textcite[][1466--1468]{easleyFlowToxicityLiquidity2012}.
 
 \subsection{Basic Rules}\label{sec:basic-rules}
@@ -40,6 +42,8 @@ \subsubsection{Quote Rule}\label{sec:quote-rule}
 
 The quote rule can be estimated at the exchange level or on the \gls{NBBO}.
 
+\todo{“The structure of the U.S. options market is similar to that of the equity market but has some distinct features. Options are typically cross-listed across multiple fully electronic exchanges, and the NBBO rule is enforced. Investors can post limit or market orders, and market-makers are obliged to provide continuous two-sided quotes.” Make clear what the difference is and why it matters.}
+
 \subsubsection{Tick Test}\label{sec:tick-test}
 
 A common alternative to the quote rule is the tick test. Based on the rationale that buys increase trade prices and sells lower them, the tick test classifies trades by the change in trade price. It was first applied in \textcites[][244]{holthausenEffectLargeBlock1987}[][240]{hasbrouckTradesQuotesInventories1988}. The tick test is defined as:
diff --git a/reports/Content/supervised-approaches.tex b/reports/Content/supervised-approaches.tex
index 9e612be8..ecc9e4d3 100644
--- a/reports/Content/supervised-approaches.tex
+++ b/reports/Content/supervised-approaches.tex
@@ -326,6 +326,7 @@ \subsubsection{Attention Mechanism}\label{sec:attention}
 
 Rather than relying on a single attention function, \textcite[][4--5]{vaswaniAttentionAllYou2017} introduce multiple \emph{attention heads}, which perform attention in parallel on $H$ \emph{different} linear projections of queries, keys, and values. The \emph{multi-head attention} enables the model to learn richer representations of the input, as attention heads operate independently, they can pick up unique patterns or focus on different positions in the sequence at once. Multi-head attention is visualised in \cref{fig:transformer-architecture-overview} (centre).
 
+\todo{introduce word modalities}
 Exemplary for machine translation, \textcite[][5795]{voitaAnalyzingMultiHeadSelfAttention2019} show, that heads serve indeed distinct purposes like learning positional or syntactic relations between tokens. It is conceivable, that for tabular data this maps to dependencies between features. In practice, Transformers may not leverage all attention heads and some heads could even be pruned without impacting the performance \autocites[][9]{michelAreSixteenHeads2019}[][5805]{voitaAnalyzingMultiHeadSelfAttention2019}.
 
 Multi-head attention can be computed as:
diff --git a/reports/Content/training-tuning.tex b/reports/Content/training-tuning.tex
index b30a5302..30cd3c94 100644
--- a/reports/Content/training-tuning.tex
+++ b/reports/Content/training-tuning.tex
@@ -226,7 +226,7 @@ \subsubsection{Hyperparameter Tuning}\label{sec:hyperparameter-tuning}
         $\ell_2$ Leaf Regularisation & $\operatorname{UniformInt}[2, 30]$            & 12                                     & 9                                      & 13                                     \\
         Random Strength              & $\operatorname{LogUniform}[\num{1e-9}, 10.0]$ & \num{2e-8}                             & \num{5e-8}                             & \num{5e-8}                             \\
         Bagging Temperature          & $\operatorname{Uniform}[0.0, 1.0]$            & 0.34010535578784745                    & 0.5214954412829511                     & 0.4666577105566224                     \\ \midrule
-        \multicolumn{2}{l}{Validation Accuracy in \%}                                               & {$\downarrow \num{64.29671279599335}$} & {$\downarrow \num{74.83010065958079}$} & {$\downarrow \num{76.41433947686962}$} \\ \bottomrule
+        \multicolumn{2}{l}{Validation Accuracy in \%}                                               & {$\mathrel{\textcolor{viz-red}{\downarrow}} \num{64.29671279599335}$} & {$\mathrel{\textcolor{viz-red}{\downarrow}} \num{74.83010065958079}$} & {$\mathrel{\textcolor{viz-red}{\downarrow}} \num{76.41433947686962}$} \\ \bottomrule % \mathrel: \textcolor groups the arrow into an ordinary atom; restore its relation spacing before the number
     \end{tabular}
 \end{table}
 
@@ -315,7 +315,7 @@ \subsubsection{Hyperparameter Tuning}\label{sec:hyperparameter-tuning}
         Learning rate $\eta$                 & $\operatorname{LogUniform}[\num{3e-5}, \num{3e-4}]$ & \num{1e-6}                           & \num{1e-6}                           & \num{1e-6}                  \\
         Weight decay $\lambda$               & $\operatorname{LogUniform}[\num{1e-6}, \num{1e-3}]$ & \num{6e-5}                           & \num{6e-5}                           & \num{6e-5}                  \\ \midrule
         Validation Accuracy \%               & Pre-Train                                           & {\num{95.89540958404541}}            & {\num{95.64009308815002}}           & {\num{94.06319856643677}}   \\
-                                             & Fine-Tune                                           & {$\uparrow \num{65.13623935106421}$} & {$\uparrow \num{75.69871634547757}$} & {$\uparrow \num{77.8904}$}  \\ \bottomrule
+                                             & Fine-Tune                                           & {$\mathrel{\textcolor{viz-green}{\uparrow}} \num{65.13623935106421}$} & {$\mathrel{\textcolor{viz-green}{\uparrow}} \num{75.69871634547757}$} & {$\mathrel{\textcolor{viz-green}{\uparrow}} \num{77.8904}$}  \\ \bottomrule % \mathrel: \textcolor groups the arrow into an ordinary atom; restore its relation spacing before the number
     \end{tabular}
 \end{table}
 
diff --git a/reports/thesis.tex b/reports/thesis.tex
index d19f8bb7..45336625 100644
--- a/reports/thesis.tex
+++ b/reports/thesis.tex
@@ -27,6 +27,8 @@
 % Colors
 \usepackage{xcolor} % Enables the definition of more colors.
 
+\definecolor{viz-green}{RGB}{0,150,0} % green used for the coloured \uparrow markers in the tuning tables
+% viz-red is defined exactly once, on the next line (HTML FF0000). A second \definecolor{viz-red} placed here would be silently overridden by it.
 \definecolor{viz-red}{HTML}{FF0000}
 \definecolor{viz-gray}{HTML}{D6DCE5}
 \definecolor{viz-white}{HTML}{FFFFFF}
@@ -49,6 +51,7 @@
 \newcommand{\bestcircle}{\tikz{\node[circle,draw=darkgray, fill=white, line width=0.5pt, minimum width=0.2cm,minimum height=0.2cm, inner sep=0pt, draw opacity=.2] at (0,0){};}}
 \newcommand{\myline}{\tikz{\draw[dashed, gray, line width=0.5pt] (0,0) -- (0,0.3);}}
 
+\usepackage{enumitem} % enumerate with letters https://tex.stackexchange.com/a/129960
 
 % Tables and Graphs
 \usepackage{makecell} %Connected rows
@@ -193,6 +196,7 @@
 \newacronym{FFN}{FFN}{feed-forward network}
 \newacronym{FS}{FS}{feature set}
 \newacronym{ISE}{ISE}{International Securities Exchange}
+\newacronym{ITM}{ITM}{in-the-money}
 \newacronym{GBRT}{GBRT}{gradient-boosted regression tree}
 \newacronym{GPU}{GPU}{graphics processing unit}
 \newacronym{GSU}{GSU}{Grauer-Schuster-Uhrig-Homburg}
@@ -261,7 +265,7 @@
 \newglossaryentry{exploding-gradient}{name={exploding gradient},plural={exploding gradients},description={Exploding gradients is a problem encountered in training deep neural networks with backpropagation. Error gradients can accumulate, and result in very large parameter updates and unstable training of the network. The opposite is the vanishing gradient problem, whereby gradients become successively smaller during backpropagation, resulting in no or small parameter updates of the network. In both cases, the network does not converge.}}
 
 % compile only locally
-\includeonly{Content/data-preprocessing}
+% \includeonly{Content/introduction,Content/rule-approaches,Content/data-preprocessing,Content/Appendix}
 
 % ----------------------------------- Start of document -----------------------------------
 \begin{document}