From ff76f865ec119d1927fc4d2c4318e8a3ad8c3793 Mon Sep 17 00:00:00 2001
From: Markus Bilz <github@markusbilz.com>
Date: Mon, 4 Dec 2023 16:25:49 +0100
Subject: [PATCH] Use prev/succ trade price from subset in all hybrids

---
 README.md                          |  2 +-
 src/tclf/classical_classifier.py   | 24 ++++----
 tests/test_classical_classifier.py | 93 +++++++++++++++++++++++++-----
 3 files changed, 91 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index c38dbac..99f24a1 100644
--- a/README.md
+++ b/README.md
@@ -78,7 +78,7 @@ features = ["trade_price", "bid_ex", "ask_ex", "bid_best", "ask_best"]
 clf = ClassicalClassifier(
     layers=[("quote", "ex"), ("quote", "best")], strategy="const", features=features
 )
-clf.fit(X, y_true)
+clf.fit(X)
 
 y_pred = clf.predict(X)
 print(accuracy_score(y_true, y_pred))
diff --git a/src/tclf/classical_classifier.py b/src/tclf/classical_classifier.py
index bee7983..6039bec 100644
--- a/src/tclf/classical_classifier.py
+++ b/src/tclf/classical_classifier.py
@@ -160,7 +160,7 @@ def _quote(self, subset: str) -> npt.NDArray:
         )
 
     def _lr(self, subset: str) -> npt.NDArray:
-        """Classify a trade as a buy (sell) if its price is above (below) the midpoint (quote rule), and use the tick test (all) to classify midspread trades.
+        """Classify a trade as a buy (sell) if its price is above (below) the midpoint (quote rule), and use the tick test (same subset) to classify midspread trades.
 
         Adapted from Lee and Ready (1991).
 
@@ -172,10 +172,10 @@ def _lr(self, subset: str) -> npt.NDArray:
             Can be np.NaN.
         """
         q_r = self._quote(subset)
-        return np.where(~np.isnan(q_r), q_r, self._tick("all"))
+        return np.where(~np.isnan(q_r), q_r, self._tick(subset))
 
     def _rev_lr(self, subset: str) -> npt.NDArray:
-        """Classify a trade as a buy (sell) if its price is above (below) the midpoint (quote rule), and use the reverse tick test (all) to classify midspread trades.
+        """Classify a trade as a buy (sell) if its price is above (below) the midpoint (quote rule), and use the reverse tick test (same subset) to classify midspread trades.
 
         Adapted from Lee and Ready (1991).
 
@@ -187,7 +187,7 @@ def _rev_lr(self, subset: str) -> npt.NDArray:
             rule. Can be np.NaN.
         """
         q_r = self._quote(subset)
-        return np.where(~np.isnan(q_r), q_r, self._rev_tick("all"))
+        return np.where(~np.isnan(q_r), q_r, self._rev_tick(subset))
 
     def _mid(self, subset: str) -> npt.NDArray:
         """Calculate the midpoint of the bid and ask spread.
@@ -245,7 +245,7 @@ def _is_at_upper_xor_lower_quantile(
         return in_upper ^ in_lower
 
     def _emo(self, subset: str) -> npt.NDArray:
-        """Classify a trade as a buy (sell) if the trade takes place at the ask (bid) quote, and use the tick test (all) to classify all other trades.
+        """Classify a trade as a buy (sell) if the trade takes place at the ask (bid) quote, and use the tick test (same subset) to classify all other trades.
 
         Adapted from Ellis et al. (2000).
 
@@ -257,11 +257,11 @@ def _emo(self, subset: str) -> npt.NDArray:
             np.NaN.
         """
         return np.where(
-            self._is_at_ask_xor_bid(subset), self._quote(subset), self._tick("all")
+            self._is_at_ask_xor_bid(subset), self._quote(subset), self._tick(subset)
         )
 
     def _rev_emo(self, subset: str) -> npt.NDArray:
-        """Classify a trade as a buy (sell) if the trade takes place at the ask (bid) quote, and use the reverse tick test (all) to classify all other trades.
+        """Classify a trade as a buy (sell) if the trade takes place at the ask (bid) quote, and use the reverse tick test (same subset) to classify all other trades.
 
         Adapted from Grauer et al. (2022).
 
@@ -273,7 +273,7 @@ def _rev_emo(self, subset: str) -> npt.NDArray:
             Can be np.NaN.
         """
         return np.where(
-            self._is_at_ask_xor_bid(subset), self._quote(subset), self._rev_tick("all")
+            self._is_at_ask_xor_bid(subset), self._quote(subset), self._rev_tick(subset)
         )
 
     def _clnv(self, subset: str) -> npt.NDArray:
@@ -282,7 +282,7 @@ def _clnv(self, subset: str) -> npt.NDArray:
         Spread is divided into ten deciles and trades are classified as follows:
         - use quote rule for at ask until 30 % below ask (upper 3 deciles)
         - use quote rule for at bid until 30 % above bid (lower 3 deciles)
-        - use tick rule (all) for all other trades (±2 deciles from midpoint; outside
+        - use tick rule (same subset) for all other trades (±2 deciles from midpoint; outside
         bid or ask).
 
         Adapted from Chakrabarty et al. (2007).
@@ -297,7 +297,7 @@ def _clnv(self, subset: str) -> npt.NDArray:
         return np.where(
             self._is_at_upper_xor_lower_quantile(subset),
             self._quote(subset),
-            self._tick("all"),
+            self._tick(subset),
         )
 
     def _rev_clnv(self, subset: str) -> npt.NDArray:
@@ -306,7 +306,7 @@ def _rev_clnv(self, subset: str) -> npt.NDArray:
         Spread is divided into ten deciles and trades are classified as follows:
         - use quote rule for at ask until 30 % below ask (upper 3 deciles)
         - use quote rule for at bid until 30 % above bid (lower 3 deciles)
-        - use reverse tick rule (all) for all other trades (±2 deciles from midpoint;
+        - use reverse tick rule (same subset) for all other trades (±2 deciles from midpoint;
         outside bid or ask).
 
         Similar to extension of emo algorithm proposed Grauer et al. (2022).
@@ -321,7 +321,7 @@ def _rev_clnv(self, subset: str) -> npt.NDArray:
         return np.where(
             self._is_at_upper_xor_lower_quantile(subset),
             self._quote(subset),
-            self._rev_tick("all"),
+            self._rev_tick(subset),
         )
 
     def _trade_size(self, subset: str) -> npt.NDArray:
diff --git a/tests/test_classical_classifier.py b/tests/test_classical_classifier.py
index b92398e..cd146d7 100644
--- a/tests/test_classical_classifier.py
+++ b/tests/test_classical_classifier.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+from numpy.testing import assert_allclose
 from sklearn.utils.validation import check_is_fitted
 
 from tclf.classical_classifier import ClassicalClassifier
@@ -72,7 +73,9 @@ def test_strategy_const(self) -> None:
         fitted_classifier = ClassicalClassifier(
             layers=[("nan", "ex")], strategy="const"
         ).fit(self.x_train)
-        assert (fitted_classifier.predict_proba(self.x_test) == 0.5).all()
+        assert_allclose(
+            fitted_classifier.predict_proba(self.x_test), 0.5, rtol=1e-09, atol=1e-09
+        )
 
     def test_invalid_func(self) -> None:
         """Test, if only valid function strings can be passed.
@@ -135,9 +138,9 @@ def test_np_array(self) -> None:
         y_train = np.array([0, 0, 0])
         y_test = np.array([-1, 1])
 
-        columns = ["trade_price", "price_ex_lag", "price_all_lead"]
+        columns = ["trade_price", "price_ex_lag", "price_ex_lead"]
         fitted_classifier = ClassicalClassifier(
-            layers=[("tick", "ex"), ("rev_tick", "all")],
+            layers=[("tick", "ex"), ("rev_tick", "ex")],
             random_state=7,
             features=columns,
         ).fit(x_train, y_train)
@@ -268,13 +271,23 @@ def test_lr(self, subset: str) -> None:
         """
         x_train = pd.DataFrame(
             [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
-            columns=["trade_price", f"bid_{subset}", f"ask_{subset}", "price_all_lag"],
+            columns=[
+                "trade_price",
+                f"bid_{subset}",
+                f"ask_{subset}",
+                f"price_{subset}_lag",
+            ],
         )
         y_train = pd.Series([-1, 1, -1])
         # first two by quote rule, remaining two by tick rule.
         x_test = pd.DataFrame(
             [[1, 1, 3, 0], [3, 1, 3, 0], [1, 1, 1, 0], [3, 2, 4, 4]],
-            columns=["trade_price", f"bid_{subset}", f"ask_{subset}", "price_all_lag"],
+            columns=[
+                "trade_price",
+                f"bid_{subset}",
+                f"ask_{subset}",
+                f"price_{subset}_lag",
+            ],
         )
         y_test = pd.Series([-1, 1, 1, -1])
         fitted_classifier = ClassicalClassifier(
@@ -294,7 +307,12 @@ def test_rev_lr(self, subset: str) -> None:
         """
         x_train = pd.DataFrame(
             [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
-            columns=["trade_price", f"bid_{subset}", f"ask_{subset}", "price_all_lead"],
+            columns=[
+                "trade_price",
+                f"bid_{subset}",
+                f"ask_{subset}",
+                f"price_{subset}_lead",
+            ],
         )
         y_train = pd.Series([-1, 1, -1])
         # first two by quote rule, two by tick rule, and two by random chance.
@@ -307,7 +325,12 @@ def test_rev_lr(self, subset: str) -> None:
                 [1, 1, np.nan, np.nan],
                 [1, 1, np.nan, np.nan],
             ],
-            columns=["trade_price", f"bid_{subset}", f"ask_{subset}", "price_all_lead"],
+            columns=[
+                "trade_price",
+                f"bid_{subset}",
+                f"ask_{subset}",
+                f"price_{subset}_lead",
+            ],
         )
         y_test = pd.Series([-1, 1, 1, -1, -1, 1])
         fitted_classifier = ClassicalClassifier(
@@ -327,7 +350,12 @@ def test_emo(self, subset: str) -> None:
         """
         x_train = pd.DataFrame(
             [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
-            columns=["trade_price", f"bid_{subset}", f"ask_{subset}", "price_all_lag"],
+            columns=[
+                "trade_price",
+                f"bid_{subset}",
+                f"ask_{subset}",
+                f"price_{subset}_lag",
+            ],
         )
         y_train = pd.Series([-1, 1, -1])
         # first two by quote rule, two by tick rule, two by random chance.
@@ -345,7 +373,12 @@ def test_emo(self, subset: str) -> None:
                 [1, 1, np.inf, np.nan],
                 [1, 1, np.nan, np.nan],
             ],
-            columns=["trade_price", f"bid_{subset}", f"ask_{subset}", "price_all_lag"],
+            columns=[
+                "trade_price",
+                f"bid_{subset}",
+                f"ask_{subset}",
+                f"price_{subset}_lag",
+            ],
         )
         y_test = pd.Series([-1, 1, 1, -1, -1, 1])
         fitted_classifier = ClassicalClassifier(
@@ -365,7 +398,12 @@ def test_rev_emo(self, subset: str) -> None:
         """
         x_train = pd.DataFrame(
             [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
-            columns=["trade_price", f"bid_{subset}", f"ask_{subset}", "price_all_lead"],
+            columns=[
+                "trade_price",
+                f"bid_{subset}",
+                f"ask_{subset}",
+                f"price_{subset}_lead",
+            ],
         )
         y_train = pd.Series([-1, 1, -1])
         # first two by quote rule, two by tick rule, two by random chance.
@@ -378,7 +416,12 @@ def test_rev_emo(self, subset: str) -> None:
                 [1, 1, np.inf, np.nan],
                 [1, 1, np.nan, np.nan],
             ],
-            columns=["trade_price", f"ask_{subset}", f"bid_{subset}", "price_all_lead"],
+            columns=[
+                "trade_price",
+                f"ask_{subset}",
+                f"bid_{subset}",
+                f"price_{subset}_lead",
+            ],
         )
         y_test = pd.Series([-1, 1, 1, -1, -1, 1])
         fitted_classifier = ClassicalClassifier(
@@ -398,7 +441,12 @@ def test_clnv(self, subset: str) -> None:
         """
         x_train = pd.DataFrame(
             [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
-            columns=["trade_price", f"ask_{subset}", f"bid_{subset}", "price_all_lag"],
+            columns=[
+                "trade_price",
+                f"ask_{subset}",
+                f"bid_{subset}",
+                f"price_{subset}_lag",
+            ],
         )
         y_train = pd.Series([-1, 1, -1])
         # first two by quote rule, two by tick rule, two by random chance.
@@ -411,7 +459,12 @@ def test_clnv(self, subset: str) -> None:
                 [1.7, 3, 1, 0],  # tick rule
                 [1.3, 3, 1, 1],  # quote rule
             ],
-            columns=["trade_price", f"ask_{subset}", f"bid_{subset}", "price_all_lag"],
+            columns=[
+                "trade_price",
+                f"ask_{subset}",
+                f"bid_{subset}",
+                f"price_{subset}_lag",
+            ],
         )
         y_test = pd.Series([1, -1, 1, -1, 1, -1])
         fitted_classifier = ClassicalClassifier(
@@ -431,7 +484,12 @@ def test_rev_clnv(self, subset: str) -> None:
         """
         x_train = pd.DataFrame(
             [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
-            columns=["trade_price", f"ask_{subset}", f"bid_{subset}", "price_all_lead"],
+            columns=[
+                "trade_price",
+                f"ask_{subset}",
+                f"bid_{subset}",
+                f"price_{subset}_lead",
+            ],
         )
         y_train = pd.Series([-1, 1, -1])
         # .
@@ -444,7 +502,12 @@ def test_rev_clnv(self, subset: str) -> None:
                 [1.7, 3, 1, 0],  # rev tick rule
                 [1.3, 3, 1, 1],  # quote rule
             ],
-            columns=["trade_price", f"ask_{subset}", f"bid_{subset}", "price_all_lead"],
+            columns=[
+                "trade_price",
+                f"ask_{subset}",
+                f"bid_{subset}",
+                f"price_{subset}_lead",
+            ],
         )
         y_test = pd.Series([1, -1, 1, -1, 1, -1])
         fitted_classifier = ClassicalClassifier(