From fb518b93a74057ec90c21ee950a47b8d424d4ca7 Mon Sep 17 00:00:00 2001 From: Markus Bilz Date: Mon, 20 Nov 2023 17:25:40 +0100 Subject: [PATCH] Add ruff + basic implementation of ClassicalClassifier --- .github/workflows/tests.yaml | 2 +- examples/plot_classifier.py | 45 --- examples/plot_template.py | 18 - examples/plot_transformer.py | 27 -- pyproject.toml | 66 ++-- {tclf/tests => src/tclf}/__init__.py | 0 src/tclf/classical_classifier.py | 526 ++++++++++++++++++++++++++ tclf/__init__.py | 9 - tclf/_template.py | 217 ----------- tclf/_version.py | 1 - tclf/tests/test_common.py | 11 - tclf/tests/test_template.py | 61 --- tests/__init__.py | 0 tests/templates.py | 52 +++ tests/test_classical_classifier.py | 541 +++++++++++++++++++++++++++ 15 files changed, 1165 insertions(+), 411 deletions(-) delete mode 100644 examples/plot_classifier.py delete mode 100644 examples/plot_template.py delete mode 100644 examples/plot_transformer.py rename {tclf/tests => src/tclf}/__init__.py (100%) create mode 100644 src/tclf/classical_classifier.py delete mode 100644 tclf/__init__.py delete mode 100644 tclf/_template.py delete mode 100644 tclf/_version.py delete mode 100644 tclf/tests/test_common.py delete mode 100644 tclf/tests/test_template.py create mode 100644 tests/__init__.py create mode 100644 tests/templates.py create mode 100644 tests/test_classical_classifier.py diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 8ad336a..346cc60 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -19,6 +19,6 @@ jobs: python -m pip install --upgrade pip setuptools pip install .[dev] - name: Test with pytest - run: pytest -v --cov=tclf --pyargs tclf + run: pytest -v --cov=src tests/ - name: Upload Coverage to Codecov uses: codecov/codecov-action@v3 \ No newline at end of file diff --git a/examples/plot_classifier.py b/examples/plot_classifier.py deleted file mode 100644 index 104bd42..0000000 --- a/examples/plot_classifier.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -============================ -Plotting Template Classifier -============================ - -An example plot of :class:`tclf.template.TemplateClassifier` -""" -import numpy as np -from matplotlib import pyplot as plt - -from tclf import TemplateClassifier - -X = [[0, 0], [1, 1]] -y = [0, 1] -clf = TemplateClassifier() -clf.fit(X, y) - -rng = np.random.RandomState(13) -X_test = rng.rand(500, 2) -y_pred = clf.predict(X_test) - -X_0 = X_test[y_pred == 0] -X_1 = X_test[y_pred == 1] - - -p0 = plt.scatter(0, 0, c="red", s=100) -p1 = plt.scatter(1, 1, c="blue", s=100) - -ax0 = plt.scatter(X_0[:, 0], X_0[:, 1], c="crimson", s=50) -ax1 = plt.scatter(X_1[:, 0], X_1[:, 1], c="deepskyblue", s=50) - -leg = plt.legend( - [p0, p1, ax0, ax1], - ["Point 0", "Point 1", "Class 0", "Class 1"], - loc="upper left", - fancybox=True, - scatterpoints=1, -) -leg.get_frame().set_alpha(0.5) - -plt.xlabel("Feature 1") -plt.ylabel("Feature 2") -plt.xlim([-0.5, 1.5]) - -plt.show() diff --git a/examples/plot_template.py b/examples/plot_template.py deleted file mode 100644 index 13e27e8..0000000 --- a/examples/plot_template.py +++ /dev/null @@ -1,18 +0,0 @@ -""" -=========================== -Plotting Template Estimator -=========================== - -An example plot of :class:`tclf.template.TemplateEstimator` -""" -import numpy as np -from matplotlib import pyplot as plt - -from tclf import TemplateEstimator - -X = np.arange(100).reshape(100, 1) -y = np.zeros((100,)) -estimator = TemplateEstimator() -estimator.fit(X, y) -plt.plot(estimator.predict(X)) -plt.show() diff --git a/examples/plot_transformer.py b/examples/plot_transformer.py deleted file mode 100644 index 1abcb76..0000000 --- a/examples/plot_transformer.py +++ /dev/null @@ -1,27 +0,0 @@ -""" -============================= -Plotting Template Transformer -============================= - -An example plot of :class:`tclf.template.TemplateTransformer` -""" -import numpy as np -from matplotlib import pyplot as plt - -from tclf import TemplateTransformer - -X = np.arange(50, dtype=float).reshape(-1, 1) -X /= 50 -estimator = TemplateTransformer() -X_transformed = estimator.fit_transform(X) - -plt.plot(X.flatten(), label="Original Data") -plt.plot(X_transformed.flatten(), label="Transformed Data") -plt.title("Plots of original and transformed data") - -plt.legend(loc="best") -plt.grid(True) -plt.xlabel("Index") -plt.ylabel("Value of Data") - -plt.show() diff --git a/pyproject.toml b/pyproject.toml index 88c8e29..030e74e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,9 +2,6 @@ requires = ["setuptools"] build-backend = "setuptools.build_meta" -[tool.setuptools] -packages = ["tclf"] - [project] name = "otc" authors = [ @@ -35,16 +32,6 @@ dynamic = ["version"] "Homepage" = "https://github.com/KarelZe/thesis" "Bug Tracker" = "https://github.com/KarelZe/thesis/issues" -[tool.autoflake] -recursive = true -in-place = true -ignore-init-module-imports = true -remove-all-unused-imports = true -remove-unused-variables = true - -[tool.isort] -profile = "black" - [tool.mypy] # https://github.com/python/mypy/issues/2410 ignore_missing_imports = true @@ -55,18 +42,13 @@ disallow_incomplete_defs = true [project.optional-dependencies] dev=[ "build", + "mypy", "pre-commit", "pytest", "pytest-cov", "ruff", ] - -[tool.pylint.TYPECHECK] -# List of members which are set dynamically and missed by Pylint inference -# system, and so shouldn't trigger E1101 when accessed. -generated-members=["numpy.*", "torch.*"] - [tool.pytest.ini_options] minversion = 7.0 addopts = "-ra -p no:warnings -v --cov --cov-report term-missing --doctest-modules" @@ -79,8 +61,6 @@ omit = [ "tclf/tests/*", ] branch = true -source = ["tclf"] -include = ["*/tclf/*"] [tool.coverage.report] exclude_also = [ @@ -94,3 +74,47 @@ exclude_also = [ "if self.verbose:" ] show_missing = true + + +[tool.ruff] +# See rules: https://beta.ruff.rs/docs/rules/ +select = [ + "C", # flake8-comprehensions + "D", # pydocstyle + "E", # pycodestyle errors + "F", # pyflakes + "I", # isort + "N", # pep8-naming + "NPY", # numpy + "PD", # pandas-vet + "PIE", # misc lints + "PT", # pytest + "PTH", # flake8-use-pathlib + "PGH", # pygrep + "RET", # return + "RUF", # ruff-specific rules + "UP", # pyupgrade + "SIM", # flake8-simplify + "W", # pycodestyle warnings +] + +include = ["*.py", "*.pyi", "**/pyproject.toml", "*.ipynb"] + +ignore = [ + "E501", # line too long, handled by black + "N803", # argument name should be lowercase + "N806", # variable name should be lowercase + "C901", # too complex + "D206", # indent with white space + "W191", # tab identation +] + +[tool.ruff.isort] +known-first-party = ["tclf"] +section-order = ["future", "standard-library", "third-party", "first-party", "local-folder"] + +[tool.ruff.per-file-ignores] +"__init__.py" = ["D104", "F401"] # disable missing docstrings in __init__, unused imports + +[tool.ruff.pydocstyle] +convention = "google" \ No newline at end of file diff --git a/tclf/tests/__init__.py b/src/tclf/__init__.py similarity index 100% rename from tclf/tests/__init__.py rename to src/tclf/__init__.py diff --git a/src/tclf/classical_classifier.py b/src/tclf/classical_classifier.py new file mode 100644 index 0000000..32194bb --- /dev/null +++ b/src/tclf/classical_classifier.py @@ -0,0 +1,526 @@ +"""Implements classical trade classification rules with a sklearn-like interface. + +Both simple rules like quote rule or tick test or hybrids are included. +""" + +from __future__ import annotations + +from typing import Any, Literal + +import numpy as np +import numpy.typing as npt +import pandas as pd +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.utils import check_random_state +from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import _check_sample_weight, check_is_fitted, check_X_y + +allowed_func_str = ( + "tick", + "rev_tick", + "quote", + "lr", + "rev_lr", + "emo", + "rev_emo", + "clnv", + "rev_clnv", + "trade_size", + "depth", + "nan", +) + +allowed_subsets = ("all", "ex", "best") + + +class ClassicalClassifier(ClassifierMixin, BaseEstimator): + """ClassicalClassifier implements several trade classification rules. + + Including: + * Tick test + * Reverse tick test + * Quote rule + * LR algorithm + * LR algorithm with reverse tick test + * EMO algorithm + * EMO algorithm with reverse tick test + * CLNV algorithm + * CLNV algorithm with reverse tick test + * Trade size rule + * Depth rule + * nan + + Args: + ---- + ClassifierMixin (_type_): ClassifierMixin + BaseEstimator (_type_): Baseestimator + """ + + def __init__( + self, + *, + layers: list[ + tuple[ + str, + str, + ] + ], + features: list[str] | None = None, + random_state: float | None = 42, + strategy: Literal["random", "const"] = "random", + ): + """Initialize a ClassicalClassifier. + + Args: + layers (List[ tuple[ str, str, ] ]): Layers of classical rule. + features (List[str] | None, optional): List of feature names in order of + columns. Required to match columns in feature matrix with label. + Can be `None`, if `pd.DataFrame` is passed. Defaults to None. + random_state (float | None, optional): random seed. Defaults to 42. + strategy (Literal["random", "const"], optional): Strategy to fill unclassfied. Randomly with uniform probability or with constant 0. Defaults to "random". + """ + self.layers = layers + self.random_state = random_state + self.features = features + self.strategy = strategy + + def _more_tags(self) -> dict[str, bool]: + """Set tags for sklearn. + + See: https://scikit-learn.org/stable/developers/develop.html#estimator-tags + """ + # FIXME: Try enabling _skip_test again. Skip tests, as prediction is not + # invariant and parameters immutable. + return { + "allow_nan": True, + "binary_only": True, + "_skip_test": True, + "poor_score": True, + } + + def _tick(self, subset: Literal["all", "ex"]) -> npt.NDArray: + """Classify a trade as a buy (sell) if its trade price is above (below) the closest different price of a previous trade. + + Args: + subset (Literal["all", "ex"]): subset i. e., + 'all' or 'ex'. + + Returns: + npt.NDArray: result of tick rule. Can be np.NaN. + """ + return np.where( + self.X_["TRADE_PRICE"] > self.X_[f"price_{subset}_lag"], + 1, + np.where( + self.X_["TRADE_PRICE"] < self.X_[f"price_{subset}_lag"], -1, np.nan + ), + ) + + def _rev_tick(self, subset: Literal["all", "ex"]) -> npt.NDArray: + """Classify a trade as a sell (buy) if its trade price is below (above) the closest different price of a subsequent trade. + + Args: + subset (Literal["all", "ex"]): subset i. e., + 'all' or 'ex'. + + Returns: + npt.NDArray: result of reverse tick rule. Can be np.NaN. + """ + return np.where( + self.X_[f"price_{subset}_lead"] > self.X_["TRADE_PRICE"], + -1, + np.where( + self.X_[f"price_{subset}_lead"] < self.X_["TRADE_PRICE"], 1, np.nan + ), + ) + + def _quote(self, subset: Literal["best", "ex"]) -> npt.NDArray: + """Classify a trade as a buy (sell) if its trade price is above (below) the midpoint of the bid and ask spread. Trades executed at the midspread are not classified. + + Args: + subset (Literal["ex", "best"]): subset i. e., + 'ex' or 'best'. + + Returns: + npt.NDArray: result of quote rule. Can be np.NaN. + """ + mid = self._mid(subset) + + return np.where( + self.X_["TRADE_PRICE"] > mid, + 1, + np.where(self.X_["TRADE_PRICE"] < mid, -1, np.nan), + ) + + def _lr(self, subset: Literal["best", "ex"]) -> npt.NDArray: + """Classify a trade as a buy (sell) if its price is above (below) the midpoint (quote rule), and use the tick test (all) to classify midspread trades. + + Adapted from Lee and Ready (1991). + + Args: + subset (Literal["ex", "best"]): subset i. e., + 'ex' or 'best'. + + Returns: + npt.ndarray: result of the lee and ready algorithm with tick rule. + Can be np.NaN. + """ + q_r = self._quote(subset) + return np.where(~np.isnan(q_r), q_r, self._tick("all")) + + def _rev_lr(self, subset: Literal["best", "ex"]) -> npt.NDArray: + """Classify a trade as a buy (sell) if its price is above (below) the midpoint (quote rule), and use the reverse tick test (all) to classify midspread trades. + + Adapted from Lee and Ready (1991). + + Args: + subset (Literal["ex", "best"]): subset i. e., + 'ex' or 'best'. + + Returns: + npt.NDArray: result of the lee and ready algorithm with reverse tick + rule. Can be np.NaN. + """ + q_r = self._quote(subset) + return np.where(~np.isnan(q_r), q_r, self._rev_tick("all")) + + def _mid(self, subset: Literal["best", "ex"]) -> npt.NDArray: + """Calculate the midpoint of the bid and ask spread. + + Midpoint is calculated as the average of the bid and ask spread if the spread is positive. Otherwise, np.NaN is returned. + + Args: + subset (Literal["best", "ex"]): subset i. e., + 'ex' or 'best' + Returns: + npt.NDArray: midpoints. Can be np.NaN. + """ + return np.where( + self.X_[f"ask_{subset}"] >= self.X_[f"bid_{subset}"], + 0.5 * (self.X_[f"ask_{subset}"] + self.X_[f"bid_{subset}"]), + np.nan, + ) + + def _is_at_ask_xor_bid(self, subset: Literal["best", "ex"]) -> pd.Series: + """Check if the trade price is at the ask xor bid. + + Args: + subset (Literal["ex", "best"]): subset i. e., + 'ex' or 'best'. + + Returns: + pd.Series: boolean series with result. + """ + at_ask = np.isclose(self.X_["TRADE_PRICE"], self.X_[f"ask_{subset}"], atol=1e-4) + at_bid = np.isclose(self.X_["TRADE_PRICE"], self.X_[f"bid_{subset}"], atol=1e-4) + return at_ask ^ at_bid + + def _is_at_upper_xor_lower_quantile( + self, subset: Literal["best", "ex"], quantiles: float = 0.3 + ) -> pd.Series: + """Check if the trade price is at the ask xor bid. + + Args: + subset (Literal["best", "ex"]): subset i. e., 'ex'. + quantiles (float, optional): percentage of quantiles. Defaults to 0.3. + + Returns: + pd.Series: boolean series with result. + """ + in_upper = ( + (1.0 - quantiles) * self.X_[f"ask_{subset}"] + + quantiles * self.X_[f"bid_{subset}"] + <= self.X_["TRADE_PRICE"] + ) & (self.X_["TRADE_PRICE"] <= self.X_[f"ask_{subset}"]) + in_lower = (self.X_[f"bid_{subset}"] <= self.X_["TRADE_PRICE"]) & ( + self.X_["TRADE_PRICE"] + <= quantiles * self.X_[f"ask_{subset}"] + + (1.0 - quantiles) * self.X_[f"bid_{subset}"] + ) + return in_upper ^ in_lower + + def _emo(self, subset: Literal["best", "ex"]) -> npt.NDArray: + """Classify a trade as a buy (sell) if the trade takes place at the ask (bid) quote, and use the tick test (all) to classify all other trades. + + Adapted from Ellis et al. (2000). + + Args: + subset (Literal["ex", "best"]): subset i. e., + 'ex' or 'best'. + + Returns: + npt.NDArray: result of the emo algorithm with tick rule. Can be + np.NaN. + """ + return np.where( + self._is_at_ask_xor_bid(subset), self._quote(subset), self._tick("all") + ) + + def _rev_emo(self, subset: Literal["best", "ex"]) -> npt.NDArray: + """Classify a trade as a buy (sell) if the trade takes place at the ask (bid) quote, and use the reverse tick test (all) to classify all other trades. + + Adapted from Grauer et al. (2022). + + Args: + subset (Literal["ex", "best"]): subset + i. e., 'ex' or 'best'. + + Returns: + npt.NDArray: result of the emo algorithm with reverse tick rule. + Can be np.NaN. + """ + return np.where( + self._is_at_ask_xor_bid(subset), self._quote(subset), self._rev_tick("all") + ) + + def _clnv(self, subset: Literal["best", "ex"]) -> npt.NDArray: + """Classify a trade based on deciles of the bid and ask spread. + + Spread is divided into ten deciles and trades are classified as follows: + - use quote rule for at ask until 30 % below ask (upper 3 deciles) + - use quote rule for at bid until 30 % above bid (lower 3 deciles) + - use tick rule (all) for all other trades (±2 deciles from midpoint; outside + bid or ask). + + Adapted from Chakrabarty et al. (2007). + + Args: + subset (Literal["ex", "best"]): subset i. e., + 'ex' or 'best'. + + Returns: + npt.NDArray: result of the emo algorithm with tick rule. Can be + np.NaN. + """ + return np.where( + self._is_at_upper_xor_lower_quantile(subset), + self._quote(subset), + self._tick("all"), + ) + + def _rev_clnv(self, subset: Literal["best", "ex"]) -> npt.NDArray: + """Classify a trade based on deciles of the bid and ask spread. + + Spread is divided into ten deciles and trades are classified as follows: + - use quote rule for at ask until 30 % below ask (upper 3 deciles) + - use quote rule for at bid until 30 % above bid (lower 3 deciles) + - use reverse tick rule (all) for all other trades (±2 deciles from midpoint; + outside bid or ask). + + Similar to extension of emo algorithm proposed Grauer et al. (2022). + + Args: + subset (Literal["ex", "best"]): subset i. e., + 'ex' or 'best'. + + Returns: + npt.NDArray: result of the emo algorithm with tick rule. Can be + np.NaN. + """ + return np.where( + self._is_at_upper_xor_lower_quantile(subset), + self._quote(subset), + self._rev_tick("all"), + ) + + def _trade_size(self, *args: Any) -> npt.NDArray: + """Classify a trade as a buy (sell) the trade size matches exactly either the bid (ask) quote size. + + Adapted from Grauer et al. (2022). + + Returns: + npt.NDArray: result of the trade size rule. Can be np.NaN. + """ + bid_eq_ask = np.isclose( + self.X_["ask_size_ex"], self.X_["bid_size_ex"], atol=1e-4 + ) + + ts_eq_bid = ( + np.isclose(self.X_["TRADE_SIZE"], self.X_["bid_size_ex"], atol=1e-4) + & ~bid_eq_ask + ) + ts_eq_ask = ( + np.isclose(self.X_["TRADE_SIZE"], self.X_["ask_size_ex"], atol=1e-4) + & ~bid_eq_ask + ) + + return np.where(ts_eq_bid, 1, np.where(ts_eq_ask, -1, np.nan)) + + def _depth(self, subset: Literal["best", "ex"]) -> npt.NDArray: + """Classify midspread trades as buy (sell), if the ask size (bid size) exceeds the bid size (ask size). + + Adapted from Grauer et al. (2022). + + Args: + subset (Literal["best", "ex"]): subset + + Returns: + npt.NDArray: result of depth rule. Can be np.NaN. + """ + at_mid = np.isclose(self._mid(subset), self.X_["TRADE_PRICE"], atol=1e-4) + + return np.where( + at_mid & (self.X_["ask_size_ex"] > self.X_["bid_size_ex"]), + 1, + np.where( + at_mid & (self.X_["ask_size_ex"] < self.X_["bid_size_ex"]), + -1, + np.nan, + ), + ) + + def _nan(self, *args: Any) -> npt.NDArray: + """Classify nothing. Fast forward results from previous classifier. + + Returns: + npt.NDArray: result of the trade size rule. Can be np.NaN. + """ + return np.full(shape=(self.X_.shape[0],), fill_value=np.nan) + + def fit( + self, + X: npt.NDArray | pd.DataFrame, + y: npt.NDArray | pd.Series, + sample_weight: npt.NDArray | None = None, + ) -> ClassicalClassifier: + """Fit the classifier. + + Args: + X (npt.NDArray | pd.DataFrame): features + y (npt.NDArray | pd.Series): ground truth (ignored) + sample_weight (npt.NDArray | None, optional): Sample weights. + Defaults to None. + + Raises: + ValueError: Unknown subset e. g., 'ise' + ValueError: Unknown function string e. g., 'lee-ready' + ValueError: Multi output is not supported. + + Returns: + ClassicalClassifier: Instance of itself. + """ + _check_sample_weight(sample_weight, X) + + funcs = ( + self._tick, + self._rev_tick, + self._quote, + self._lr, + self._rev_lr, + self._emo, + self._rev_emo, + self._clnv, + self._rev_clnv, + self._trade_size, + self._depth, + self._nan, + ) + + self.func_mapping_ = dict(zip(allowed_func_str, funcs)) + + # create working copy to be altered and try to get columns from df + self.columns_ = self.features + if isinstance(X, pd.DataFrame): + self.columns_ = X.columns.tolist() + + check_classification_targets(y) + + X, y = check_X_y( + X, y, multi_output=False, accept_sparse=False, force_all_finite=False + ) + + # FIXME: make flexible if open-sourced + # self.classes_ = np.unique(y) + self.classes_ = np.array([-1, 1]) + + # if no features are provided or inferred, use default + if not self.columns_: + self.columns_ = [str(i) for i in range(X.shape[1])] + + if len(self.columns_) > 0 and X.shape[1] != len(self.columns_): + raise ValueError( + f"Expected {len(self.columns_)} columns, got {X.shape[1]}." + ) + + for func_str, subset in self.layers: + if subset not in allowed_subsets: + raise ValueError( + f"Unknown subset: {subset}, expected one of {allowed_subsets}." + ) + if func_str not in allowed_func_str: + raise ValueError( + f"Unknown function string: {func_str}," + f"expected one of {allowed_func_str}." + ) + + return self + + def predict(self, X: npt.NDArray | pd.DataFrame) -> npt.NDArray: + """Perform classification on test vectors `X`. + + Args: + X (npt.NDArray | pd.DataFrame): feature matrix. + + Returns: + npt.NDArray: Predicted traget values for X. + """ + check_is_fitted(self) + + rs = check_random_state(self.random_state) + + self.X_ = pd.DataFrame(data=X, columns=self.columns_) + + mapping_cols = {"BEST_ASK": "ask_best", "BEST_BID": "bid_best"} + + self.X_ = self.X_.rename(columns=mapping_cols) + + pred = np.full(shape=(X.shape[0],), fill_value=np.nan) + + for func_str, subset in self.layers: + func = self.func_mapping_[func_str] + pred = np.where( + np.isnan(pred), + func(subset), + pred, + ) + + # fill NaNs randomly with -1 and 1 or with constant zero + mask = np.isnan(pred) + if self.strategy == "random": + pred[mask] = rs.choice(self.classes_, pred.shape)[mask] + else: + pred[mask] = np.zeros(pred.shape)[mask] + + # reset self.X_ to avoid persisting it + del self.X_ + return pred + + def predict_proba(self, X: npt.NDArray | pd.DataFrame) -> npt.NDArray: + """Predict class probabilities for X. + + Probabilities are either 0 or 1 depending on the class. + + For strategy 'constant' probabilities are (0.5,0.5) for unclassified classes. + + Args: + X (npt.NDArray | pd.DataFrame): feature matrix + + Returns: + npt.NDArray: probabilities + """ + # assign 0.5 to all classes. Required for strategy 'constant'. + prob = np.full((len(X), 2), 0.5) + + # Class can be assumed to be -1 or 1 for strategy 'random'. + # Class might be zero though for strategy constant. Mask non-zeros. + preds = self.predict(X) + mask = np.flatnonzero(preds) + + # get index of predicted class and one-hot encode it + indices = np.where(preds[mask, None] == self.classes_[None, :])[1] + n_classes = np.max(self.classes_) + 1 + + # overwrite defaults with one-hot encoded classes. + # For strategy 'constant' probabilities are (0.5,0.5). + prob[mask] = np.identity(n_classes)[indices] + return prob diff --git a/tclf/__init__.py b/tclf/__init__.py deleted file mode 100644 index 879c4c6..0000000 --- a/tclf/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from ._template import TemplateClassifier, TemplateEstimator, TemplateTransformer -from ._version import __version__ - -__all__ = [ - "TemplateEstimator", - "TemplateClassifier", - "TemplateTransformer", - "__version__", -] diff --git a/tclf/_template.py b/tclf/_template.py deleted file mode 100644 index 079d3a7..0000000 --- a/tclf/_template.py +++ /dev/null @@ -1,217 +0,0 @@ -""" -This is a module to be used as a reference for building other modules -""" -import numpy as np -from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin -from sklearn.metrics import euclidean_distances -from sklearn.utils.multiclass import unique_labels -from sklearn.utils.validation import check_array, check_is_fitted, check_X_y - - -class TemplateEstimator(BaseEstimator): - """A template estimator to be used as a reference implementation. - - For more information regarding how to build your own estimator, read more - in the :ref:`User Guide `. - - Parameters - ---------- - demo_param : str, default='demo_param' - A parameter used for demonstation of how to pass and store paramters. - - Examples - -------- - >>> from tclf import TemplateEstimator - >>> import numpy as np - >>> X = np.arange(100).reshape(100, 1) - >>> y = np.zeros((100, )) - >>> estimator = TemplateEstimator() - >>> estimator.fit(X, y) - TemplateEstimator() - """ - - def __init__(self, demo_param="demo_param"): - self.demo_param = demo_param - - def fit(self, X, y): - """A reference implementation of a fitting function. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The training input samples. - y : array-like, shape (n_samples,) or (n_samples, n_outputs) - The target values (class labels in classification, real numbers in - regression). - - Returns - ------- - self : object - Returns self. - """ - X, y = check_X_y(X, y, accept_sparse=True) - self.is_fitted_ = True - # `fit` should always return `self` - return self - - def predict(self, X): - """A reference implementation of a predicting function. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The training input samples. - - Returns - ------- - y : ndarray, shape (n_samples,) - Returns an array of ones. - """ - X = check_array(X, accept_sparse=True) - check_is_fitted(self, "is_fitted_") - return np.ones(X.shape[0], dtype=np.int64) - - -class TemplateClassifier(ClassifierMixin, BaseEstimator): - """An example classifier which implements a 1-NN algorithm. - - For more information regarding how to build your own classifier, read more - in the :ref:`User Guide `. - - Parameters - ---------- - demo_param : str, default='demo' - A parameter used for demonstation of how to pass and store paramters. - - Attributes - ---------- - X_ : ndarray, shape (n_samples, n_features) - The input passed during :meth:`fit`. - y_ : ndarray, shape (n_samples,) - The labels passed during :meth:`fit`. - classes_ : ndarray, shape (n_classes,) - The classes seen at :meth:`fit`. - """ - - def __init__(self, demo_param="demo"): - self.demo_param = demo_param - - def fit(self, X, y): - """A reference implementation of a fitting function for a classifier. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - The training input samples. - y : array-like, shape (n_samples,) - The target values. An array of int. - - Returns - ------- - self : object - Returns self. - """ - # Check that X and y have correct shape - X, y = check_X_y(X, y) - # Store the classes seen during fit - self.classes_ = unique_labels(y) - - self.X_ = X - self.y_ = y - # Return the classifier - return self - - def predict(self, X): - """A reference implementation of a prediction for a classifier. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - The input samples. - - Returns - ------- - y : ndarray, shape (n_samples,) - The label for each sample is the label of the closest sample - seen during fit. - """ - # Check is fit had been called - check_is_fitted(self, ["X_", "y_"]) - - # Input validation - X = check_array(X) - - closest = np.argmin(euclidean_distances(X, self.X_), axis=1) - return self.y_[closest] - - -class TemplateTransformer(TransformerMixin, BaseEstimator): - """An example transformer that returns the element-wise square root. - - For more information regarding how to build your own transformer, read more - in the :ref:`User Guide `. - - Parameters - ---------- - demo_param : str, default='demo' - A parameter used for demonstation of how to pass and store paramters. - - Attributes - ---------- - n_features_ : int - The number of features of the data passed to :meth:`fit`. - """ - - def __init__(self, demo_param="demo"): - self.demo_param = demo_param - - def fit(self, X, y=None): - """A reference implementation of a fitting function for a transformer. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The training input samples. - y : None - There is no need of a target in a transformer, yet the pipeline API - requires this parameter. - - Returns - ------- - self : object - Returns self. - """ - X = check_array(X, accept_sparse=True) - - self.n_features_ = X.shape[1] - - # Return the transformer - return self - - def transform(self, X): - """A reference implementation of a transform function. - - Parameters - ---------- - X : {array-like, sparse-matrix}, shape (n_samples, n_features) - The input samples. - - Returns - ------- - X_transformed : array, shape (n_samples, n_features) - The array containing the element-wise square roots of the values - in ``X``. - """ - # Check is fit had been called - check_is_fitted(self, "n_features_") - - # Input validation - X = check_array(X, accept_sparse=True) - - # Check that the input is of the same shape as the one passed - # during fit. - if X.shape[1] != self.n_features_: - raise ValueError( - "Shape of input is different from what was seen" "in `fit`" - ) - return np.sqrt(X) diff --git a/tclf/_version.py b/tclf/_version.py deleted file mode 100644 index 27fdca4..0000000 --- a/tclf/_version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "0.0.3" diff --git a/tclf/tests/test_common.py b/tclf/tests/test_common.py deleted file mode 100644 index 57a70b9..0000000 --- a/tclf/tests/test_common.py +++ /dev/null @@ -1,11 +0,0 @@ -import pytest -from sklearn.utils.estimator_checks import check_estimator - -from tclf import TemplateClassifier, TemplateEstimator, TemplateTransformer - - -@pytest.mark.parametrize( - "estimator", [TemplateEstimator(), TemplateTransformer(), TemplateClassifier()] -) -def test_all_estimators(estimator): - return check_estimator(estimator) diff --git a/tclf/tests/test_template.py b/tclf/tests/test_template.py deleted file mode 100644 index 1fa8a61..0000000 --- a/tclf/tests/test_template.py +++ /dev/null @@ -1,61 +0,0 @@ -import numpy as np -import pytest -from numpy.testing import assert_allclose, assert_array_equal -from sklearn.datasets import load_iris - -from tclf import TemplateClassifier, TemplateEstimator, TemplateTransformer - - -@pytest.fixture -def data(): - return load_iris(return_X_y=True) - - -def test_template_estimator(data): - est = TemplateEstimator() - assert est.demo_param == "demo_param" - - est.fit(*data) - assert hasattr(est, "is_fitted_") - - X = data[0] - y_pred = est.predict(X) - assert_array_equal(y_pred, np.ones(X.shape[0], dtype=np.int64)) - - -def test_template_transformer_error(data): - X, y = data - trans = TemplateTransformer() - trans.fit(X) - with pytest.raises(ValueError, match="Shape of input is different"): - X_diff_size = np.ones((10, X.shape[1] + 1)) - trans.transform(X_diff_size) - - -def test_template_transformer(data): - X, y = data - trans = TemplateTransformer() - assert trans.demo_param == "demo" - - trans.fit(X) - assert trans.n_features_ == X.shape[1] - - X_trans = trans.transform(X) - assert_allclose(X_trans, np.sqrt(X)) - - X_trans = trans.fit_transform(X) - assert_allclose(X_trans, np.sqrt(X)) - - -def test_template_classifier(data): - X, y = data - clf = TemplateClassifier() - assert clf.demo_param == "demo" - - clf.fit(X, y) - assert hasattr(clf, "classes_") - assert hasattr(clf, "X_") - assert hasattr(clf, "y_") - - y_pred = clf.predict(X) - assert y_pred.shape == (X.shape[0],) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/templates.py b/tests/templates.py new file mode 100644 index 0000000..596acbb --- /dev/null +++ b/tests/templates.py @@ -0,0 +1,52 @@ +"""Tests for Neural networks. + +See: +https://thenerdstation.medium.com/how-to-unit-test-machine-learning-code-57cf6fd81765 +http://karpathy.github.io/2019/04/25/recipe/ +https://krokotsch.eu/posts/deep-learning-unit-tests/ +""" + +import pandas as pd +from sklearn.base import BaseEstimator +from sklearn.utils.estimator_checks import check_estimator + + +class ClassifierMixin: + """Perform automated tests for Classifiers. + + Args: + ---- + unittest (_type_): unittest module + """ + + clf: BaseEstimator + x_test: pd.DataFrame + y_test: pd.Series + + def test_sklearn_compatibility(self) -> None: + """Test, if classifier is compatible with sklearn.""" + check_estimator(self.clf) + + def test_shapes(self) -> None: + """Test, if shapes of the classifier equal the targets. + + Shapes are usually [no. of samples, 1]. + """ + y_pred = self.clf.predict(self.x_test) + + assert self.y_test.shape == y_pred.shape + + def test_proba(self) -> None: + """Test, if probabilities are in [0, 1].""" + y_pred = self.clf.predict_proba(self.x_test) + assert (y_pred >= 0).all() + assert (y_pred <= 1).all() + + def test_score(self) -> None: + """Test, if score is correctly calculated.. + + For a random classification i. e., `layers=[("nan", "ex")]`, the score + should be around 0.5. + """ + accuracy = self.clf.score(self.x_test, self.y_test) + assert 0.0 <= accuracy <= 1.0 diff --git a/tests/test_classical_classifier.py b/tests/test_classical_classifier.py new file mode 100644 index 0000000..3674d9c --- /dev/null +++ b/tests/test_classical_classifier.py @@ -0,0 +1,541 @@ +"""Tests for the classical classifier. + +Use of artificial data to test the classifier. +""" + +import numpy as np +import pandas as pd +import pytest +from sklearn.utils.validation import check_is_fitted + +from tclf.classical_classifier import ClassicalClassifier +from tests.templates import ClassifierMixin + + +class TestClassicalClassifier(ClassifierMixin): + """Perform automated tests for ClassicalClassifier. + + Args: + ---- + unittest (_type_): unittest module + """ + + def setup(self) -> None: + """Set up basic classifier and data. + + Prepares inputs and expected outputs for testing. + """ + self.x_train = pd.DataFrame( + [[1, 2], [3, 4], [1, 2], [3, 4]], columns=["BEST_ASK", "BEST_BID"] + ) + self.y_train = pd.Series([1, 1, -1, -1]) + self.x_test = pd.DataFrame( + [[1, 2], [3, 4], [1, 2], [3, 4]], columns=["BEST_ASK", "BEST_BID"] + ) + self.y_test = pd.Series([1, -1, 1, -1]) + self.clf = ClassicalClassifier( + layers=[("nan", "ex")], + random_state=7, + ).fit(self.x_train, self.y_train) + + def test_random_state(self) -> None: + """Test, if random state is correctly set. + + Two classifiers with the same random state should give the same results. + """ + first_classifier = ClassicalClassifier( + layers=[("nan", "ex")], + random_state=50, + ).fit(self.x_train, self.y_train) + first_y_pred = first_classifier.predict(self.x_test) + + second_classifier = ClassicalClassifier( + layers=[("nan", "ex")], + random_state=50, + ).fit(self.x_train, self.y_train) + second_y_pred = second_classifier.predict(self.x_test) + + assert (first_y_pred == second_y_pred).all() + + def test_fit(self) -> None: + """Test, if fit works. + + A fitted classifier should have an attribute `layers_`. + """ + fitted_classifier = ClassicalClassifier( + layers=[("nan", "ex")], + random_state=42, + ).fit(self.x_train, self.y_train) + assert check_is_fitted(fitted_classifier) is None + + def test_strategy_const(self) -> None: + """Test, if strategy 'const' returns correct proabilities. + + A classifier with strategy 'constant' should return class probabilities + of (0.5, 0.5), if a trade can not be classified. + """ + fitted_classifier = ClassicalClassifier( + layers=[("nan", "ex")], strategy="const" + ).fit(self.x_train, self.y_train) + assert (fitted_classifier.predict_proba(self.x_test) == 0.5).all() + + def test_invalid_func(self) -> None: + """Test, if only valid function strings can be passed. + + An exception should be raised for invalid function strings. + Test for 'foo', which is no valid rule. + """ + classifier = ClassicalClassifier( + layers=[("foo", "all")], + random_state=42, + ) + with pytest.raises(ValueError, match=r"Unknown function string"): + classifier.fit(self.x_train, self.y_train) + + def test_invalid_subset(self) -> None: + """Test, if only valid subset strings can be passed. + + An exception should be raised for invalid subsets. + Test for 'bar', which is no valid subset. + """ + classifier = ClassicalClassifier( + layers=[("tick", "bar")], + random_state=42, + ) + with pytest.raises(ValueError, match=r"Unknown subset"): + classifier.fit(self.x_train, self.y_train) + + def test_invalid_col_length(self) -> None: + """Test, if only valid column length can be passed. + + An exception should be raised if length of columns list does not match + the number of columns in the data. `features` is only used if, data is + not passed as `pd.DataFrame`.Test for columns list of length 2, which + does not match the data. + """ + classifier = ClassicalClassifier( + layers=[("tick", "all")], random_state=42, features=["one"] + ) + with pytest.raises(ValueError, match=r"Expected"): + classifier.fit(self.x_train.values, self.y_train.values) + + def test_override(self) -> None: + """Test, if classifier does not override valid results from layer one. + + If all data can be classified using first rule, first rule should + only be applied. + """ + x_train = pd.DataFrame( + [[0, 0, 0], [0, 0, 0], [0, 0, 0]], + columns=["TRADE_PRICE", "price_ex_lag", "price_all_lead"], + ) + y_train = pd.Series([-1, 1, -1]) + x_test = pd.DataFrame( + [[1, 2, 0], [2, 1, 3]], + columns=["TRADE_PRICE", "price_ex_lag", "price_all_lead"], + ) + y_test = pd.Series([-1, 1]) + fitted_classifier = ClassicalClassifier( + layers=[("tick", "ex"), ("rev_tick", "all")], + random_state=7, + ).fit(x_train, y_train) + y_pred = fitted_classifier.predict(x_test) + assert (y_pred == y_test).all() + + def test_np_array(self) -> None: + """Test, if classifier works, if only np.ndarrays are provided. + + If only np.ndarrays are provided, the classifier should work, by constructing + a dataframe from the arrays and the `columns` list. + """ + x_train = np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]]) + x_test = np.array([[1, 2, 0], [2, 1, 3]]) + y_train = np.array([0, 0, 0]) + y_test = np.array([-1, 1]) + + columns = ["TRADE_PRICE", "price_ex_lag", "price_all_lead"] + fitted_classifier = ClassicalClassifier( + layers=[("tick", "ex"), ("rev_tick", "all")], + random_state=7, + features=columns, + ).fit(x_train, y_train) + y_pred = fitted_classifier.predict(x_test) + assert (y_pred == y_test).all() + + @pytest.mark.parametrize("subset", ["best", "ex"]) + def test_mid(self, subset: str) -> None: + """Test, if no mid is calculated, if bid exceeds ask etc.""" + x_train = pd.DataFrame( + [[0, 0, 0], [0, 0, 0], [0, 0, 0]], + columns=["TRADE_PRICE", f"bid_{subset}", f"ask_{subset}"], + ) + y_train = pd.Series([-1, 1, -1]) + # first two by rule, all other by random chance. + x_test = pd.DataFrame( + [ + [1.5, 1, 3], + [2.5, 1, 3], + [1.5, 3, 1], # bid > ask + [2.5, 3, 1], # bid > ask + [1, np.nan, 1], # missing data + [3, np.nan, np.nan], # missing_data + ], + columns=["TRADE_PRICE", f"bid_{subset}", f"ask_{subset}"], + ) + y_test = pd.Series([-1, 1, 1, -1, -1, 1]) + fitted_classifier = ClassicalClassifier( + layers=[("quote", subset)], random_state=45 + ).fit(x_train, y_train) + y_pred = fitted_classifier.predict(x_test) + assert (y_pred == y_test).all() + + @pytest.mark.parametrize("subset", ["all", "ex"]) + def test_tick_rule(self, subset: str) -> None: + """Test, if tick rule is correctly applied. + + Tests cases where prev. trade price is higher, lower, equal or missing. + + Args: + subset (str): subset e. g., 'ex' + """ + x_train = pd.DataFrame( + [[0, 0], [0, 0], [0, 0]], columns=["TRADE_PRICE", f"price_{subset}_lag"] + ) + y_train = pd.Series([-1, 1, -1]) + x_test = pd.DataFrame( + [[1, 2], [2, 1], [1, 1], [1, np.nan]], + columns=["TRADE_PRICE", f"price_{subset}_lag"], + ) + + # first two by rule (see p. 28 Grauer et al.), remaining two by random chance. + y_test = pd.Series([-1, 1, 1, -1]) + fitted_classifier = ClassicalClassifier( + layers=[("tick", subset)], + random_state=7, + ).fit(x_train, y_train) + y_pred = fitted_classifier.predict(x_test) + assert (y_pred == y_test).all() + + @pytest.mark.parametrize("subset", ["all", "ex"]) + def test_rev_tick_rule(self, subset: str) -> None: + """Test, if rev. tick rule is correctly applied. + + Tests cases where suc. trade price is higher, lower, equal or missing. + + Args: + subset (str): subset e. g., 'ex' + """ + x_train = pd.DataFrame( + [[0, 0], [0, 0], [0, 0]], columns=["TRADE_PRICE", f"price_{subset}_lead"] + ) + y_train = pd.Series([-1, 1, -1]) + x_test = pd.DataFrame( + [[1, 2], [2, 1], [1, 1], [1, np.nan]], + columns=["TRADE_PRICE", f"price_{subset}_lead"], + ) + + # first two by rule (see p. 28 Grauer et al.), remaining two by random chance. + y_test = pd.Series([-1, 1, 1, -1]) + fitted_classifier = ClassicalClassifier( + layers=[("rev_tick", subset)], random_state=7 + ).fit(x_train, y_train) + y_pred = fitted_classifier.predict(x_test) + assert (y_pred == y_test).all() + + @pytest.mark.parametrize("subset", ["best", "ex"]) + def test_quote_rule(self, subset: str) -> None: + """Test, if quote rule is correctly applied. + + Tests cases where prev. trade price is higher, lower, equal or missing. + + Args: + subset (str): subset e. g., 'ex' + """ + x_train = pd.DataFrame( + [[0, 0, 0], [0, 0, 0], [0, 0, 0]], + columns=["TRADE_PRICE", f"bid_{subset}", f"ask_{subset}"], + ) + y_train = pd.Series([-1, 1, -1]) + # first two by rule (see p. 28 Grauer et al.), remaining four by random chance. + x_test = pd.DataFrame( + [ + [1, 1, 3], + [3, 1, 3], + [1, 1, 1], + [3, 2, 4], + [1, np.nan, 1], + [3, np.nan, np.nan], + ], + columns=["TRADE_PRICE", f"bid_{subset}", f"ask_{subset}"], + ) + y_test = pd.Series([-1, 1, 1, -1, -1, 1]) + fitted_classifier = ClassicalClassifier( + layers=[("quote", subset)], random_state=45 + ).fit(x_train, y_train) + y_pred = fitted_classifier.predict(x_test) + assert (y_pred == y_test).all() + + @pytest.mark.parametrize("subset", ["best", "ex"]) + def test_lr(self, subset: str) -> None: + """Test, if the lr algorithm is correctly applied. + + Tests cases where both quote rule and tick rule all are used. + + Args: + subset (str): subset e. g., 'ex' + """ + x_train = pd.DataFrame( + [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + columns=["TRADE_PRICE", f"bid_{subset}", f"ask_{subset}", "price_all_lag"], + ) + y_train = pd.Series([-1, 1, -1]) + # first two by quote rule, remaining two by tick rule. + x_test = pd.DataFrame( + [[1, 1, 3, 0], [3, 1, 3, 0], [1, 1, 1, 0], [3, 2, 4, 4]], + columns=["TRADE_PRICE", f"bid_{subset}", f"ask_{subset}", "price_all_lag"], + ) + y_test = pd.Series([-1, 1, 1, -1]) + fitted_classifier = ClassicalClassifier( + layers=[("lr", subset)], random_state=7 + ).fit(x_train, y_train) + y_pred = fitted_classifier.predict(x_test) + assert (y_pred == y_test).all() + + @pytest.mark.parametrize("subset", ["best", "ex"]) + def test_rev_lr(self, subset: str) -> None: + """Test, if the rev. lr algorithm is correctly applied. + + Tests cases where both quote rule and tick rule all are used. + + Args: + subset (str): subset e. g., 'ex' + """ + x_train = pd.DataFrame( + [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + columns=["TRADE_PRICE", f"bid_{subset}", f"ask_{subset}", "price_all_lead"], + ) + y_train = pd.Series([-1, 1, -1]) + # first two by quote rule, two by tick rule, and two by random chance. + x_test = pd.DataFrame( + [ + [1, 1, 3, 0], + [3, 1, 3, 0], + [1, 1, 1, 0], + [3, 2, 4, 4], + [1, 1, np.nan, np.nan], + [1, 1, np.nan, np.nan], + ], + columns=["TRADE_PRICE", f"bid_{subset}", f"ask_{subset}", "price_all_lead"], + ) + y_test = pd.Series([-1, 1, 1, -1, -1, 1]) + fitted_classifier = ClassicalClassifier( + layers=[("rev_lr", subset)], random_state=42 + ).fit(x_train, y_train) + y_pred = fitted_classifier.predict(x_test) + assert (y_pred == y_test).all() + + @pytest.mark.parametrize("subset", ["best", "ex"]) + def test_emo(self, subset: str) -> None: + """Test, if the emo algorithm is correctly applied. + + Tests cases where both quote rule at bid or ask and tick rule all are used. + + Args: + subset (str): subset e.g., best + """ + x_train = pd.DataFrame( + [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + columns=["TRADE_PRICE", f"bid_{subset}", f"ask_{subset}", "price_all_lag"], + ) + y_train = pd.Series([-1, 1, -1]) + # first two by quote rule, two by tick rule, two by random chance. + x_test = pd.DataFrame( + [ + [1, 1, 3, 0], + [3, 1, 3, 0], + [ + 1, + 1, + 1, + 0, + ], + [3, 2, 4, 4], + [1, 1, np.inf, np.nan], + [1, 1, np.nan, np.nan], + ], + columns=["TRADE_PRICE", f"bid_{subset}", f"ask_{subset}", "price_all_lag"], + ) + y_test = pd.Series([-1, 1, 1, -1, -1, 1]) + fitted_classifier = ClassicalClassifier( + layers=[("emo", subset)], random_state=42 + ).fit(x_train, y_train) + y_pred = fitted_classifier.predict(x_test) + assert (y_pred == y_test).all() + + @pytest.mark.parametrize("subset", ["best", "ex"]) + def test_rev_emo(self, subset: str) -> None: + """Test, if the rev. emo algorithm is correctly applied. + + Tests cases where both quote rule at bid or ask and rev. tick rule all are used. + + Args: + subset (str): subset e. g., 'ex' + """ + x_train = pd.DataFrame( + [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + columns=["TRADE_PRICE", f"bid_{subset}", f"ask_{subset}", "price_all_lead"], + ) + y_train = pd.Series([-1, 1, -1]) + # first two by quote rule, two by tick rule, two by random chance. + x_test = pd.DataFrame( + [ + [1, 1, 3, 0], + [3, 1, 3, 0], + [1, 1, 1, 0], + [3, 2, 4, 4], + [1, 1, np.inf, np.nan], + [1, 1, np.nan, np.nan], + ], + columns=["TRADE_PRICE", f"ask_{subset}", f"bid_{subset}", "price_all_lead"], + ) + y_test = pd.Series([-1, 1, 1, -1, -1, 1]) + fitted_classifier = ClassicalClassifier( + layers=[("rev_emo", subset)], random_state=42 + ).fit(x_train, y_train) + y_pred = fitted_classifier.predict(x_test) + assert (y_pred == y_test).all() + + @pytest.mark.parametrize("subset", ["best", "ex"]) + def test_clnv(self, subset: str) -> None: + """Test, if the clnv algorithm is correctly applied. + + Tests cases where both quote rule and tick rule all are used. + + Args: + subset (str): subset e. g., 'ex' + """ + x_train = pd.DataFrame( + [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + columns=["TRADE_PRICE", f"ask_{subset}", f"bid_{subset}", "price_all_lag"], + ) + y_train = pd.Series([-1, 1, -1]) + # first two by quote rule, two by tick rule, two by random chance. + x_test = pd.DataFrame( + [ + [5, 3, 1, 0], # tick rule + [0, 3, 1, 1], # tick rule + [2.9, 3, 1, 1], # quote rule + [2.3, 3, 1, 3], # tick rule + [1.7, 3, 1, 0], # tick rule + [1.3, 3, 1, 1], # quote rule + ], + columns=["TRADE_PRICE", f"ask_{subset}", f"bid_{subset}", "price_all_lag"], + ) + y_test = pd.Series([1, -1, 1, -1, 1, -1]) + fitted_classifier = ClassicalClassifier( + layers=[("clnv", subset)], random_state=42 + ).fit(x_train, y_train) + y_pred = fitted_classifier.predict(x_test) + assert (y_pred == y_test).all() + + @pytest.mark.parametrize("subset", ["best", "ex"]) + def test_rev_clnv(self, subset: str) -> None: + """Test, if the rev. clnv algorithm is correctly applied. + + Tests cases where both quote rule and rev. tick rule all are used. + + Args: + subset (str): subset e. g., 'ex' + """ + x_train = pd.DataFrame( + [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], + columns=["TRADE_PRICE", f"ask_{subset}", f"bid_{subset}", "price_all_lead"], + ) + y_train = pd.Series([-1, 1, -1]) + # . + x_test = pd.DataFrame( + [ + [5, 3, 1, 0], # rev tick rule + [0, 3, 1, 1], # rev tick rule + [2.9, 3, 1, 1], # quote rule + [2.3, 3, 1, 3], # rev tick rule + [1.7, 3, 1, 0], # rev tick rule + [1.3, 3, 1, 1], # quote rule + ], + columns=["TRADE_PRICE", f"ask_{subset}", f"bid_{subset}", "price_all_lead"], + ) + y_test = pd.Series([1, -1, 1, -1, 1, -1]) + fitted_classifier = ClassicalClassifier( + layers=[("rev_clnv", subset)], random_state=5 + ).fit(x_train, y_train) + y_pred = fitted_classifier.predict(x_test) + assert (y_pred == y_test).all() + + def test_trade_size(self) -> None: + """Test, if the trade size algorithm is correctly applied. + + Tests cases where relevant data is present or missing. + """ + x_train = pd.DataFrame( + [[0, 0, 0], [0, 0, 0], [0, 0, 0]], + columns=["TRADE_SIZE", "ask_size_ex", "bid_size_ex"], + ) + y_train = pd.Series([-1, 1, -1]) + # first two by trade size, random, at bid size, random, random. + x_test = pd.DataFrame( + [ + [1, 1, 3], + [3, 1, 3], + [1, 1, 1], + [3, np.nan, 3], + [1, np.inf, 2], + [1, np.inf, 2], + ], + columns=["TRADE_SIZE", "ask_size_ex", "bid_size_ex"], + ) + y_test = pd.Series([-1, 1, -1, 1, -1, 1]) + fitted_classifier = ClassicalClassifier( + layers=[("trade_size", "ex")], random_state=42 + ).fit(x_train, y_train) + y_pred = fitted_classifier.predict(x_test) + assert (y_pred == y_test).all() + + def test_depth(self) -> None: + """Test, if the depth rule is correctly applied. + + Tests cases where relevant data is present or missing. + """ + x_train = pd.DataFrame( + [[2, 1, 4, 4, 4], [1, 2, 2, 4, 3], [2, 1, 2, 4, 2], [1, 2, 2, 4, 2]], + columns=[ + "ask_size_ex", + "bid_size_ex", + "ask_ex", + "bid_ex", + "TRADE_PRICE", + ], + ) + y_train = pd.Series([-1, 1, -1, 1]) + # first three by depth, all other random as mid is different from trade price. + x_test = pd.DataFrame( + [ + [2, 1, 2, 4, 3], + [1, 2, 2, 4, 3], + [2, 1, 4, 4, 4], + [2, 1, 2, 4, 2], + [2, 1, 2, 4, 2], + ], + columns=[ + "ask_size_ex", + "bid_size_ex", + "ask_ex", + "bid_ex", + "TRADE_PRICE", + ], + ) + y_test = pd.Series([1, -1, 1, 1, -1]) + fitted_classifier = ClassicalClassifier( + layers=[("depth", "ex")], random_state=5 + ).fit(x_train, y_train) + y_pred = fitted_classifier.predict(x_test) + assert (y_pred == y_test).all()