From 201048812fe1d808e7882c55f75b07db426741b9 Mon Sep 17 00:00:00 2001
From: Antoine Guillaume
Date: Wed, 9 Nov 2022 10:39:09 +0100
Subject: [PATCH 1/4] Correction n_jobs setting, ambiguous names and prime generation

---
 convst/__init__.py                            |  2 +-
 convst/classifiers/rdst_ensemble.py           |  4 +-
 convst/classifiers/rdst_ridge.py              | 13 ++++--
 convst/transformers/_commons.py               |  6 +--
 .../transformers/_multivariate_same_length.py |  4 +-
 .../_multivariate_variable_length.py          |  4 +-
 .../transformers/_univariate_same_length.py   |  6 +--
 .../_univariate_variable_length.py            |  4 +-
 convst/transformers/rdst.py                   | 40 ++++++++++-------
 convst/utils/dataset_utils.py                 | 11 +++--
 pyproject.toml                                |  2 +-
 tests/test_rdst.py                            | 45 +++++++++++++++++++
 12 files changed, 99 insertions(+), 42 deletions(-)

diff --git a/convst/__init__.py b/convst/__init__.py
index 73a524f..317c74f 100644
--- a/convst/__init__.py
+++ b/convst/__init__.py
@@ -1,5 +1,5 @@
 __author__ = 'Antoine Guillaume antoine.guillaume45@gmail.com'
-__version__ = "0.2.1"
+__version__ = "0.2.3"
 
 
 __all__ = ['transformers', 'classifiers', 'utils', 'interpreters']
\ No newline at end of file
diff --git a/convst/classifiers/rdst_ensemble.py b/convst/classifiers/rdst_ensemble.py
index 89163f9..53e961a 100644
--- a/convst/classifiers/rdst_ensemble.py
+++ b/convst/classifiers/rdst_ensemble.py
@@ -16,6 +16,7 @@
 from sklearn.preprocessing import StandardScaler
 from sklearn.pipeline import make_pipeline
 
+from numba import set_num_threads
 
 class _internalRidgeCV(RidgeClassifierCV):
     def __init__(self, **kwargs):
@@ -107,6 +108,7 @@ def fit(self, X, y):
             Derivate(),
             Periodigram()
         ]
+        set_num_threads(self.n_jobs_rdst)
         models = Parallel(
             n_jobs=self.n_jobs,
             prefer=self.backend,
@@ -118,7 +120,7 @@ def fit(self, X, y):
                 R_DST(
                     n_shapelets=self.n_shapelets_per_estimator,
                     alpha=self.shp_alpha, n_samples=self.n_samples,
-                    proba_norm=self.proba_norm[i], n_jobs=-1,
+                    proba_norm=self.proba_norm[i], n_jobs=False,
                     shapelet_lengths=self.shapelet_lengths,
                     phase_invariance=self.phase_invariance,
                     prime_dilations=self.prime_dilations,
diff --git a/convst/classifiers/rdst_ridge.py b/convst/classifiers/rdst_ridge.py
index 50feaf7..8b4b2c9 100644
--- a/convst/classifiers/rdst_ridge.py
+++ b/convst/classifiers/rdst_ridge.py
@@ -9,8 +9,11 @@
 from convst.transformers._input_transformers import c_StandardScaler
 from convst.transformers import R_DST
+from convst.utils.checks_utils import check_n_jobs
 
 from sklearn.metrics import accuracy_score
 
+from numba import set_num_threads
+
 class R_DST_Ridge(BaseEstimator, ClassifierMixin):
     """
     A wrapper class which use R_DST as a transformer, followed by a Ridge
@@ -95,10 +98,14 @@ def __init__(
         self.shapelet_lengths=shapelet_lengths
         self.proba_norm=proba_norm
         self.percentiles=percentiles
-        self.n_jobs=n_jobs
+        if isinstance(n_jobs, bool):
+            self.n_jobs=n_jobs
+        else:
+            self.n_jobs=check_n_jobs(n_jobs)
+        set_num_threads(self.n_jobs)
         self.random_state=random_state
         self.min_len=min_len
-        
+
     def _more_tags(self):
         return {
             "capability:variable_length": True,
@@ -126,10 +133,10 @@ def _init_components(self):
             normalize_output=self.normalize_output,
             n_samples=self.n_samples,
             n_shapelets=self.n_shapelets,
+            n_jobs=False,
             shapelet_lengths=self.shapelet_lengths,
             proba_norm=self.proba_norm,
             percentiles=self.percentiles,
-            n_jobs=self.n_jobs,
             random_state=self.random_state,
             min_len=self.min_len
         )
diff --git a/convst/transformers/_commons.py b/convst/transformers/_commons.py
index 4ebd0d6..9b7ba1e 100644
--- a/convst/transformers/_commons.py
+++ b/convst/transformers/_commons.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 from numba import njit, prange
-from numpy import float_, sqrt, zeros, unique, bool_, where, int64, all as _all
+from numpy import float_, sqrt, zeros, unique, bool_, where, int64
 
 ###############################################################################
 #                                                                             #
@@ -576,8 +576,8 @@ def _combinations_1d(x,y):
 
 @njit(cache=True)
 def prime_up_to(n):
-    is_p = zeros(n, dtype=bool_)
-    for i in range(n):
+    is_p = zeros(n+1, dtype=bool_)
+    for i in range(n+1):
         is_p[i] = is_prime(i)
     return where(is_p)[0]
 
diff --git a/convst/transformers/_multivariate_same_length.py b/convst/transformers/_multivariate_same_length.py
index 0f19eda..ae52bd6 100644
--- a/convst/transformers/_multivariate_same_length.py
+++ b/convst/transformers/_multivariate_same_length.py
@@ -18,7 +18,7 @@
 from numba import njit, prange
 
 @njit(cache=True)
-def _init_random_shapelet_params(
+def M_SL_init_random_shapelet_params(
     n_shapelets, shapelet_sizes, n_timestamps, p_norm, max_channels, prime_scheme
 ):
     """
@@ -147,7 +147,7 @@ def M_SL_generate_shapelet(
     #Initialize shapelets
     values, lengths, dilations, threshold, normalize, n_channels, channel_ids = \
-        _init_random_shapelet_params(
+        M_SL_init_random_shapelet_params(
         n_shapelets, shapelet_sizes, n_timestamps, p_norm, max_channels, prime_scheme
     )
     #Initialize self similarity mask
diff --git a/convst/transformers/_multivariate_variable_length.py b/convst/transformers/_multivariate_variable_length.py
index 07cc487..d3cae4c 100644
--- a/convst/transformers/_multivariate_variable_length.py
+++ b/convst/transformers/_multivariate_variable_length.py
@@ -18,7 +18,7 @@
 from numba import njit, prange
 
 @njit(cache=True)
-def _init_random_shapelet_params(
+def M_VL_init_random_shapelet_params(
     n_shapelets, shapelet_sizes, n_timestamps, p_norm, max_channels, prime_scheme
 ):
     """
@@ -152,7 +152,7 @@ def M_VL_generate_shapelet(
     #Initialize shapelets
     values, lengths, dilations, threshold, normalize, n_channels, channel_ids = \
-        _init_random_shapelet_params(
+        M_VL_init_random_shapelet_params(
         n_shapelets, shapelet_sizes, min_len, p_norm, max_channels, prime_scheme
     )
     #Initialize self similarity mask
diff --git a/convst/transformers/_univariate_same_length.py b/convst/transformers/_univariate_same_length.py
index 8d2d36b..a70f87f 100644
--- a/convst/transformers/_univariate_same_length.py
+++ b/convst/transformers/_univariate_same_length.py
@@ -17,7 +17,7 @@
 from numba import njit, prange
 
 @njit(cache=True)
-def _init_random_shapelet_params(
+def U_SL_init_random_shapelet_params(
     n_shapelets, shapelet_sizes, n_timestamps, p_norm, prime_scheme
 ):
     """
@@ -64,8 +64,6 @@ def _init_random_shapelet_params(
     for i in prange(n_shapelets):
         powers[i] = uniform(0, upper_bounds[i])
     dilations = floor(power(2, powers)).astype(int64)
-    
-    #PRIME DILATION
 
     # Init threshold array
     threshold = zeros(n_shapelets)
@@ -135,7 +133,7 @@ def U_SL_generate_shapelet(
     #Initialize shapelets
     values, lengths, dilations, threshold, normalize = \
-        _init_random_shapelet_params(
+        U_SL_init_random_shapelet_params(
         n_shapelets, shapelet_sizes, n_timestamps, p_norm, prime_scheme
     )
     #Initialize self similarity mask
diff --git a/convst/transformers/_univariate_variable_length.py b/convst/transformers/_univariate_variable_length.py
index 61d783c..5934d72 100644
--- a/convst/transformers/_univariate_variable_length.py
+++ b/convst/transformers/_univariate_variable_length.py
@@ -20,7 +20,7 @@
 # TODO : check if numba could support Tuple of variable length numpy arrays as input
 @njit(cache=True)
-def _init_random_shapelet_params(
+def U_VL_init_random_shapelet_params(
     n_shapelets, shapelet_sizes, n_timestamps, p_norm, prime_scheme
 ):
     """
@@ -145,7 +145,7 @@ def U_VL_generate_shapelet(
     #Initialize shapelets
     values, lengths, dilations, threshold, normalize = \
-        _init_random_shapelet_params(
+        U_VL_init_random_shapelet_params(
         n_shapelets, shapelet_sizes, min_len, p_norm, prime_scheme
     )
 
diff --git a/convst/transformers/rdst.py b/convst/transformers/rdst.py
index 31b9207..63fd4bb 100644
--- a/convst/transformers/rdst.py
+++ b/convst/transformers/rdst.py
@@ -13,8 +13,8 @@
 from sklearn.utils.validation import check_is_fitted, check_random_state
 
 from convst.utils.checks_utils import (
-    check_array_3D, check_array_1D, check_n_jobs, check_is_numeric,
-    check_is_boolean
+    check_array_3D, check_array_1D, check_is_numeric,
+    check_is_boolean, check_n_jobs
 )
 
 from convst.transformers._commons import manhattan, euclidean, squared_euclidean
@@ -108,10 +108,10 @@ def __init__(
         prime_dilations=False,
         proba_norm=0.8,
         percentiles=[5,10],
-        n_jobs=1,
         random_state=None,
         max_channels=None,
-        min_len=None
+        min_len=None,
+        n_jobs=1
     ):
         self.transform_type = self._validate_transform_type(transform_type)
         self.phase_invariance = check_is_boolean(phase_invariance)
@@ -124,18 +124,21 @@ def __init__(
         if shapelet_lengths_bounds is None:
             self.shapelet_lengths_bounds = None
         elif len(shapelet_lengths_bounds)==2:
-            self.shapelet_lengths_bounds = check_array_1D(shapelet_lengths_bounds)
+            self.shapelet_lengths_bounds = shapelet_lengths_bounds
         else:
             raise ValueError('Shapelets lengths bounds should be a 1D array with 2 values')
         self.lengths_bounds_reduction=check_is_numeric(lengths_bounds_reduction)
+        if self.lengths_bounds_reduction>=1:
+            raise ValueError('lengths_bounds_reduction parameter should be in range [0,1[')
         self.prime_dilations = check_is_boolean(prime_dilations)
         self.proba_norm = check_is_numeric(proba_norm)
         self.percentiles = self._validate_percentiles(percentiles)
-        if n_jobs != -1:
-            self.n_jobs = check_n_jobs(n_jobs)
-        else:
-            self.n_jobs = n_jobs
         self.random_state = check_random_state(random_state)
+        if isinstance(n_jobs, bool):
+            self.n_jobs=n_jobs
+        else:
+            self.n_jobs=check_n_jobs(n_jobs)
+        set_num_threads(self.n_jobs)
         self.max_channels=max_channels
         self.min_len=min_len
 
@@ -148,14 +151,20 @@ def _set_lengths(self):
         else:
             b0 = self.shapelet_lengths_bounds[0]
             b1 = self.shapelet_lengths_bounds[1]
-            min_l = max(5,int(b0*self.min_len))
-            max_l = max(6,int(b1*self.min_len))
+
+            if isinstance(b0, float):
+                b0 = int(b0*self.min_len)
+            min_l = max(5,b0)
+            if isinstance(b1, float):
+                b1 = int(b1*self.min_len)
+            max_l = max(6,max(b0+1,b1+1)) #6 to ensure range 5,6 -> 5
         lengths = np.asarray(list(range(min_l, max_l)))
         if lengths.shape[0]>3:
             n_remove = int(lengths.shape[0]*self.lengths_bounds_reduction)
-            step = lengths.shape[0]//n_remove
-            lengths = lengths[::step]
+            if n_remove > 0:
+                step = lengths.shape[0]//n_remove
+                lengths = lengths[::step]
         return lengths
 
     def fit(self, X, y):
@@ -173,8 +182,6 @@ def fit(self, X, y):
             Class of the input time series.
 
         """
-        if self.n_jobs != -1:
-            set_num_threads(self.n_jobs)
         self._set_fit_transform(X)
         if self.transform_type in [STR_MULTIVARIATE_VARIABLE, STR_UNIVARIATE_VARIABLE]:
             X, X_len = self._format_uneven_timestamps(X)
@@ -205,7 +212,6 @@ def fit(self, X, y):
 
         self.shapelet_lengths = self._set_lengths()
         shapelet_lengths, seed = self._check_params(self.min_len)
-        print(shapelet_lengths)
         # Generate the shapelets
         if self.transform_type == STR_UNIVARIATE_VARIABLE:
             self.shapelets_ = self.fitter(
@@ -482,7 +488,7 @@ def _check_params(self, n_timestamps):
                 raise ValueError('Input data goint {} timestamps, at least 4 are requiered. Input format should be (n_samples, n_features, n_timestamps)'.format(n_timestamps))
             else:
                 warnings.warn("All the values in 'shapelet_lengths' must be lower than or equal to 'n_timestamps' (got {} > {}). Changed shapelet size to {}".format(shapelet_lengths.max(), n_timestamps, n_timestamps//2))
-                shapelet_lengths = np.array([n_timestamps//2])
+                shapelet_lengths = shapelet_lengths[shapelet_lengths > n_timestamps] = n_timestamps//2
 
         rng = check_random_state(self.random_state)
 
diff --git a/convst/utils/dataset_utils.py b/convst/utils/dataset_utils.py
index 40966e6..2d2d45f 100644
--- a/convst/utils/dataset_utils.py
+++ b/convst/utils/dataset_utils.py
@@ -18,7 +18,6 @@ def _custom_from_nested_to_3d_numpy(X):
         return np.array([X[i].values.T for i in range(len(X))])
     else:
         return [X[i].values.T for i in range(len(X))]
-    
 
 @njit(cache=True)
 def z_norm_3D(X):
@@ -66,7 +65,7 @@ def z_norm_3D_list(X):
 
     return X
 
-def load_sktime_dataset_split(name, normalize=True):
+def load_sktime_dataset_split(name, normalize=False):
     """
     Load the original train and test splits of a dataset from the UCR/UEA
     archive by name using sktime API.
@@ -120,7 +119,7 @@
 
     return X_train, X_test, y_train, y_test, min_len
 
-def load_sktime_arff_file(path, normalize=True):
+def load_sktime_arff_file(path, normalize=False):
     """
     Load a dataset from .arff files.
 
@@ -171,7 +170,7 @@
 
     return X_train, X_test, y_train, y_test, le
 
-def load_sktime_arff_file_resample_id(path, rs_id, normalize=True):
+def load_sktime_arff_file_resample_id(path, rs_id, normalize=False):
     """
     Load a dataset resample from .arff files and the identifier of the resample.
 
@@ -224,7 +223,7 @@ def load_sktime_arff_file_resample_id(path, rs_id, normalize=True):
 
     return X_train, X_test, y_train, y_test, le
 
-def load_sktime_ts_file(path, normalize=True):
+def load_sktime_ts_file(path, normalize=False):
     """
     Load a dataset from .ts files
 
@@ -274,7 +273,7 @@ def load_sktime_ts_file(path, normalize=True):
 
    return X_train, X_test, y_train, y_test, le
 
-def load_sktime_dataset(name, normalize=True):
+def load_sktime_dataset(name, normalize=False):
     """
     Load a dataset from the UCR/UEA archive by name using sktime API
 
diff --git a/pyproject.toml b/pyproject.toml
index dd275fa..9b2b885 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "convst"
-version = "0.2.1"
+version = "0.2.3"
 description = "The Random Dilation Shapelet Transform algorithm and associated works"
 readme = "README.md"
 authors = [
diff --git a/tests/test_rdst.py b/tests/test_rdst.py
index 6c75654..6575532 100644
--- a/tests/test_rdst.py
+++ b/tests/test_rdst.py
@@ -10,6 +10,7 @@
 from convst.transformers import R_DST
 from convst.utils.dataset_utils import load_sktime_dataset_split
 from convst.utils.experiments_utils import cross_validate_UCR_UEA
+from convst.transformers._commons import is_prime
 
 import logging
 
@@ -60,6 +61,50 @@ def test_mutliple_lengths(name, lengths):
         assert False
     assert True
 
+@pytest.mark.parametrize("name", [
+    ('ForbA'),
+    ('ForbB')
+])
+def test_prime_dilations(name):
+    X_train, X_test, y_train, y_test, min_len = load_sktime_dataset_split(
+        name=name
+    )
+    try:
+        rdst = R_DST_Ridge(min_len=min_len, prime_dilations=True).fit(X_train, y_train)
+        rdst.score(X_test, y_test)
+    except Exception as e:
+        LOGGER.info('An exception as occured during prime dilation tests: {}'.format(
+            e
+        ))
+        assert False
+    assert all([is_prime(i) for i in rdst.transformer.shapelets_[2]])
+
+
+# TODO : this may fail due to unlucky generation of shapelets, if a length
+# is not selected randomly, the expected array will be bigger than actual
+@pytest.mark.parametrize("name", "bounds", "reduction", "expected" [
+    ('GunPoint', [6,12], 0., [ 6, 7, 8, 9, 10, 11, 12]),
+    ('GunPoint', [6,12], 0.5, [ 6, 8, 10, 12]),
+    ('GunPoint', [0.1,0.15], 0., [15, 16, 17, 18, 19, 20, 21, 22]),
+    ('GunPoint', [0.1,0.15], 0.5, [15, 17, 19, 21])
+])
+def test_length_bounds(name, bounds, reduction, expected):
+    X_train, X_test, y_train, y_test, min_len = load_sktime_dataset_split(
+        name=name
+    )
+    try:
+        rdst = R_DST_Ridge(
+            min_len=min_len, shapelet_lengths_bounds=bounds,
+            lengths_bounds_reduction=reduction
+        ).fit(X_train, y_train)
+        rdst.score(X_test, y_test)
+    except Exception as e:
+        LOGGER.info('An exception as occured during length bounds tests: {}'.format(
+            e
+        ))
+        assert False
+    assert all(rdst.transformer.shapelets_[1] == expected)
+
 # Lower than actual best accuracy to account for possible deviation due to random sampling
 @pytest.mark.parametrize("name, expected", [
     ('GunPoint',0.98),

From dcc52cfb925ba17e27ae86cf7e17da994ab4fb5c Mon Sep 17 00:00:00 2001
From: Antoine Guillaume
Date: Wed, 9 Nov 2022 10:46:33 +0100
Subject: [PATCH 2/4] Correcting typos in test_rdst

---
 tests/test_rdst.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_rdst.py b/tests/test_rdst.py
index 6575532..f47c613 100644
--- a/tests/test_rdst.py
+++ b/tests/test_rdst.py
@@ -82,11 +82,11 @@ def test_prime_dilations(name):
 
 # TODO : this may fail due to unlucky generation of shapelets, if a length
 # is not selected randomly, the expected array will be bigger than actual
-@pytest.mark.parametrize("name", "bounds", "reduction", "expected" [
-    ('GunPoint', [6,12], 0., [ 6, 7, 8, 9, 10, 11, 12]),
-    ('GunPoint', [6,12], 0.5, [ 6, 8, 10, 12]),
-    ('GunPoint', [0.1,0.15], 0., [15, 16, 17, 18, 19, 20, 21, 22]),
-    ('GunPoint', [0.1,0.15], 0.5, [15, 17, 19, 21])
+@pytest.mark.parametrize("name, bounds, reduction, expected" [
+    ('GunPoint', [6, 12], 0., [6, 7, 8, 9, 10, 11, 12]),
+    ('GunPoint', [6, 12], 0.5, [6, 8, 10, 12]),
+    ('GunPoint', [0.1, 0.15], 0., [15, 16, 17, 18, 19, 20, 21, 22]),
+    ('GunPoint', [0.1, 0.15], 0.5, [15, 17, 19, 21])
 ])
 def test_length_bounds(name, bounds, reduction, expected):
     X_train, X_test, y_train, y_test, min_len = load_sktime_dataset_split(

From 12193339d76d96dac6c4762e6e4bacd873c16812 Mon Sep 17 00:00:00 2001
From: Antoine Guillaume
Date: Wed, 9 Nov 2022 10:50:05 +0100
Subject: [PATCH 3/4] Tiny weeny commatiny

---
 tests/test_rdst.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_rdst.py b/tests/test_rdst.py
index f47c613..ad4eeab 100644
--- a/tests/test_rdst.py
+++ b/tests/test_rdst.py
@@ -82,7 +82,7 @@ def test_prime_dilations(name):
 
 # TODO : this may fail due to unlucky generation of shapelets, if a length
 # is not selected randomly, the expected array will be bigger than actual
-@pytest.mark.parametrize("name, bounds, reduction, expected" [
+@pytest.mark.parametrize("name, bounds, reduction, expected", [
     ('GunPoint', [6, 12], 0., [6, 7, 8, 9, 10, 11, 12]),
     ('GunPoint', [6, 12], 0.5, [6, 8, 10, 12]),
     ('GunPoint', [0.1, 0.15], 0., [15, 16, 17, 18, 19, 20, 21, 22]),

From bc39e373ce8032a418f88c958c720d31992b74f2 Mon Sep 17 00:00:00 2001
From: Antoine Guillaume
Date: Wed, 9 Nov 2022 11:08:45 +0100
Subject: [PATCH 4/4] Fixing wrong dataset names and assert condition for rdst tests

---
 tests/test_rdst.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/test_rdst.py b/tests/test_rdst.py
index ad4eeab..d457190 100644
--- a/tests/test_rdst.py
+++ b/tests/test_rdst.py
@@ -6,6 +6,8 @@
 """
 
 import pytest
+import numpy as np
+
 from convst.classifiers import R_DST_Ridge
 from convst.transformers import R_DST
 from convst.utils.dataset_utils import load_sktime_dataset_split
@@ -62,8 +64,8 @@ def test_mutliple_lengths(name, lengths):
     assert True
 
 @pytest.mark.parametrize("name", [
-    ('ForbA'),
-    ('ForbB')
+    ('FordA'),
+    ('FordB')
 ])
 def test_prime_dilations(name):
     X_train, X_test, y_train, y_test, min_len = load_sktime_dataset_split(
@@ -103,7 +105,7 @@ def test_length_bounds(name, bounds, reduction, expected):
             e
         ))
         assert False
-    assert all(rdst.transformer.shapelets_[1] == expected)
+    assert list(np.unique(rdst.transformer.shapelets_[1])) == expected
 
 # Lower than actual best accuracy to account for possible deviation due to random sampling
 @pytest.mark.parametrize("name, expected", [