From 201048812fe1d808e7882c55f75b07db426741b9 Mon Sep 17 00:00:00 2001
From: Antoine Guillaume
Date: Wed, 9 Nov 2022 10:39:09 +0100
Subject: [PATCH 1/4] Correction n_jobs setting, ambiguous names and prime generation

---
 convst/__init__.py                            |  2 +-
 convst/classifiers/rdst_ensemble.py           |  4 +-
 convst/classifiers/rdst_ridge.py              | 13 ++++--
 convst/transformers/_commons.py               |  6 +--
 .../transformers/_multivariate_same_length.py |  4 +-
 .../_multivariate_variable_length.py          |  4 +-
 .../transformers/_univariate_same_length.py   |  6 +--
 .../_univariate_variable_length.py            |  4 +-
 convst/transformers/rdst.py                   | 40 ++++++++++-------
 convst/utils/dataset_utils.py                 | 11 +++--
 pyproject.toml                                |  2 +-
 tests/test_rdst.py                            | 45 +++++++++++++++++++
 12 files changed, 99 insertions(+), 42 deletions(-)

diff --git a/convst/__init__.py b/convst/__init__.py
index 73a524f..317c74f 100644
--- a/convst/__init__.py
+++ b/convst/__init__.py
@@ -1,5 +1,5 @@
 __author__ = 'Antoine Guillaume antoine.guillaume45@gmail.com'
-__version__ = "0.2.1"
+__version__ = "0.2.3"
 
 
 __all__ = ['transformers', 'classifiers', 'utils', 'interpreters']
\ No newline at end of file
diff --git a/convst/classifiers/rdst_ensemble.py b/convst/classifiers/rdst_ensemble.py
index 89163f9..53e961a 100644
--- a/convst/classifiers/rdst_ensemble.py
+++ b/convst/classifiers/rdst_ensemble.py
@@ -16,6 +16,7 @@
 from sklearn.preprocessing import StandardScaler
 from sklearn.pipeline import make_pipeline
 
+from numba import set_num_threads
 
 class _internalRidgeCV(RidgeClassifierCV):
     def __init__(self, **kwargs):
@@ -107,6 +108,7 @@ def fit(self, X, y):
             Derivate(),
             Periodigram()
         ]
+        set_num_threads(self.n_jobs_rdst)
         models = Parallel(
             n_jobs=self.n_jobs,
             prefer=self.backend,
@@ -118,7 +120,7 @@ def fit(self, X, y):
                 R_DST(
                     n_shapelets=self.n_shapelets_per_estimator,
                     alpha=self.shp_alpha, n_samples=self.n_samples,
-                    proba_norm=self.proba_norm[i], n_jobs=-1,
+                    proba_norm=self.proba_norm[i], n_jobs=False,
                     shapelet_lengths=self.shapelet_lengths,
                     phase_invariance=self.phase_invariance,
                     prime_dilations=self.prime_dilations,
diff --git a/convst/classifiers/rdst_ridge.py b/convst/classifiers/rdst_ridge.py
index 50feaf7..8b4b2c9 100644
--- a/convst/classifiers/rdst_ridge.py
+++ b/convst/classifiers/rdst_ridge.py
@@ -9,8 +9,11 @@
 from convst.transformers._input_transformers import c_StandardScaler
 from convst.transformers import R_DST
+from convst.utils.checks_utils import check_n_jobs
 
 from sklearn.metrics import accuracy_score
 
+from numba import set_num_threads
+
 class R_DST_Ridge(BaseEstimator, ClassifierMixin):
     """
     A wrapper class which use R_DST as a transformer, followed by a Ridge
@@ -95,10 +98,14 @@ def __init__(
         self.shapelet_lengths=shapelet_lengths
         self.proba_norm=proba_norm
         self.percentiles=percentiles
-        self.n_jobs=n_jobs
+        if isinstance(n_jobs, bool):
+            self.n_jobs=n_jobs
+        else:
+            self.n_jobs=check_n_jobs(n_jobs)
+        set_num_threads(self.n_jobs)
         self.random_state=random_state
         self.min_len=min_len
-        
+
     def _more_tags(self):
         return {
             "capability:variable_length": True,
@@ -126,10 +133,10 @@ def _init_components(self):
             normalize_output=self.normalize_output,
             n_samples=self.n_samples,
             n_shapelets=self.n_shapelets,
+            n_jobs=False,
             shapelet_lengths=self.shapelet_lengths,
             proba_norm=self.proba_norm,
             percentiles=self.percentiles,
-            n_jobs=self.n_jobs,
             random_state=self.random_state,
             min_len=self.min_len
         )
diff --git a/convst/transformers/_commons.py b/convst/transformers/_commons.py
index 4ebd0d6..9b7ba1e 100644
--- a/convst/transformers/_commons.py
+++ b/convst/transformers/_commons.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 from numba import njit, prange
-from numpy import float_, sqrt, zeros, unique, bool_, where, int64, all as _all
+from numpy import float_, sqrt, zeros, unique, bool_, where, int64
 
 ###############################################################################
 #                                                                             #
@@ -576,8 +576,8 @@ def _combinations_1d(x,y):
 
 @njit(cache=True)
 def prime_up_to(n):
-    is_p = zeros(n, dtype=bool_)
-    for i in range(n):
+    is_p = zeros(n+1, dtype=bool_)
+    for i in range(n+1):
         is_p[i] = is_prime(i)
     return where(is_p)[0]
 
diff --git a/convst/transformers/_multivariate_same_length.py b/convst/transformers/_multivariate_same_length.py
index 0f19eda..ae52bd6 100644
--- a/convst/transformers/_multivariate_same_length.py
+++ b/convst/transformers/_multivariate_same_length.py
@@ -18,7 +18,7 @@
 from numba import njit, prange
 
 @njit(cache=True)
-def _init_random_shapelet_params(
+def M_SL_init_random_shapelet_params(
     n_shapelets, shapelet_sizes, n_timestamps, p_norm, max_channels, prime_scheme
 ):
     """
@@ -147,7 +147,7 @@ def M_SL_generate_shapelet(
     #Initialize shapelets
     values, lengths, dilations, threshold, normalize, n_channels, channel_ids = \
-        _init_random_shapelet_params(
+        M_SL_init_random_shapelet_params(
         n_shapelets, shapelet_sizes, n_timestamps, p_norm, max_channels, prime_scheme
     )
     #Initialize self similarity mask
diff --git a/convst/transformers/_multivariate_variable_length.py b/convst/transformers/_multivariate_variable_length.py
index 07cc487..d3cae4c 100644
--- a/convst/transformers/_multivariate_variable_length.py
+++ b/convst/transformers/_multivariate_variable_length.py
@@ -18,7 +18,7 @@
 from numba import njit, prange
 
 @njit(cache=True)
-def _init_random_shapelet_params(
+def M_VL_init_random_shapelet_params(
     n_shapelets, shapelet_sizes, n_timestamps, p_norm, max_channels, prime_scheme
 ):
     """
@@ -152,7 +152,7 @@ def M_VL_generate_shapelet(
     #Initialize shapelets
     values, lengths, dilations, threshold, normalize, n_channels, channel_ids = \
-        _init_random_shapelet_params(
+        M_VL_init_random_shapelet_params(
         n_shapelets, shapelet_sizes, min_len, p_norm, max_channels, prime_scheme
     )
     #Initialize self similarity mask
diff --git a/convst/transformers/_univariate_same_length.py b/convst/transformers/_univariate_same_length.py
index 8d2d36b..a70f87f 100644
--- a/convst/transformers/_univariate_same_length.py
+++ b/convst/transformers/_univariate_same_length.py
@@ -17,7 +17,7 @@
 from numba import njit, prange
 
 @njit(cache=True)
-def _init_random_shapelet_params(
+def U_SL_init_random_shapelet_params(
     n_shapelets, shapelet_sizes, n_timestamps, p_norm, prime_scheme
 ):
     """
@@ -64,8 +64,6 @@ def _init_random_shapelet_params(
     for i in prange(n_shapelets):
         powers[i] = uniform(0, upper_bounds[i])
     dilations = floor(power(2, powers)).astype(int64)
-    
-    #PRIME DILATION
 
     # Init threshold array
     threshold = zeros(n_shapelets)
@@ -135,7 +133,7 @@ def U_SL_generate_shapelet(
     #Initialize shapelets
     values, lengths, dilations, threshold, normalize = \
-        _init_random_shapelet_params(
+        U_SL_init_random_shapelet_params(
         n_shapelets, shapelet_sizes, n_timestamps, p_norm, prime_scheme
     )
     #Initialize self similarity mask
diff --git a/convst/transformers/_univariate_variable_length.py b/convst/transformers/_univariate_variable_length.py
index 61d783c..5934d72 100644
--- a/convst/transformers/_univariate_variable_length.py
+++ b/convst/transformers/_univariate_variable_length.py
@@ -20,7 +20,7 @@
 # TODO : check if numba could support Tuple of variable length numpy arrays as input
 @njit(cache=True)
-def _init_random_shapelet_params(
+def U_VL_init_random_shapelet_params(
     n_shapelets, shapelet_sizes, n_timestamps, p_norm, prime_scheme
 ):
     """
@@ -145,7 +145,7 @@ def U_VL_generate_shapelet(
     #Initialize shapelets
     values, lengths, dilations, threshold, normalize = \
-        _init_random_shapelet_params(
+        U_VL_init_random_shapelet_params(
         n_shapelets, shapelet_sizes, min_len, p_norm, prime_scheme
     )
 
diff --git a/convst/transformers/rdst.py b/convst/transformers/rdst.py
index 31b9207..63fd4bb 100644
--- a/convst/transformers/rdst.py
+++ b/convst/transformers/rdst.py
@@ -13,8 +13,8 @@
 from sklearn.utils.validation import check_is_fitted, check_random_state
 
 from convst.utils.checks_utils import (
-    check_array_3D, check_array_1D, check_n_jobs, check_is_numeric,
-    check_is_boolean
+    check_array_3D, check_array_1D, check_is_numeric,
+    check_is_boolean, check_n_jobs
 )
 
 from convst.transformers._commons import manhattan, euclidean, squared_euclidean
@@ -108,10 +108,10 @@ def __init__(
         prime_dilations=False,
         proba_norm=0.8,
         percentiles=[5,10],
-        n_jobs=1,
         random_state=None,
         max_channels=None,
-        min_len=None
+        min_len=None,
+        n_jobs=1
     ):
         self.transform_type = self._validate_transform_type(transform_type)
         self.phase_invariance = check_is_boolean(phase_invariance)
@@ -124,18 +124,21 @@ def __init__(
         if shapelet_lengths_bounds is None:
             self.shapelet_lengths_bounds = None
         elif len(shapelet_lengths_bounds)==2:
-            self.shapelet_lengths_bounds = check_array_1D(shapelet_lengths_bounds)
+            self.shapelet_lengths_bounds = shapelet_lengths_bounds
         else:
             raise ValueError('Shapelets lengths bounds should be a 1D array with 2 values')
         self.lengths_bounds_reduction=check_is_numeric(lengths_bounds_reduction)
+        if self.lengths_bounds_reduction>=1:
+            raise ValueError('lengths_bounds_reduction parameter should be in range [0,1[')
         self.prime_dilations = check_is_boolean(prime_dilations)
         self.proba_norm = check_is_numeric(proba_norm)
         self.percentiles = self._validate_percentiles(percentiles)
-        if n_jobs != -1:
-            self.n_jobs = check_n_jobs(n_jobs)
-        else:
-            self.n_jobs = n_jobs
         self.random_state = check_random_state(random_state)
+        if isinstance(n_jobs, bool):
+            self.n_jobs=n_jobs
+        else:
+            self.n_jobs=check_n_jobs(n_jobs)
+        set_num_threads(self.n_jobs)
         self.max_channels=max_channels
         self.min_len=min_len
 
@@ -148,14 +151,20 @@ def _set_lengths(self):
         else:
             b0 = self.shapelet_lengths_bounds[0]
             b1 = self.shapelet_lengths_bounds[1]
-            min_l = max(5,int(b0*self.min_len))
-            max_l = max(6,int(b1*self.min_len))
+
+            if isinstance(b0, float):
+                b0 = int(b0*self.min_len)
+            min_l = max(5,b0)
+            if isinstance(b1, float):
+                b1 = int(b1*self.min_len)
+            max_l = max(6,max(b0+1,b1+1)) #6 to ensure range 5,6 -> 5
         lengths = np.asarray(list(range(min_l, max_l)))
         if lengths.shape[0]>3:
             n_remove = int(lengths.shape[0]*self.lengths_bounds_reduction)
-            step = lengths.shape[0]//n_remove
-            lengths = lengths[::step]
+            if n_remove > 0:
+                step = lengths.shape[0]//n_remove
+                lengths = lengths[::step]
         return lengths
 
     def fit(self, X, y):
@@ -173,8 +182,6 @@ def fit(self, X, y):
             Class of the input time series.
 
         """
-        if self.n_jobs != -1:
-            set_num_threads(self.n_jobs)
         self._set_fit_transform(X)
         if self.transform_type in [STR_MULTIVARIATE_VARIABLE, STR_UNIVARIATE_VARIABLE]:
             X, X_len = self._format_uneven_timestamps(X)
@@ -205,7 +212,6 @@ def fit(self, X, y):
 
         self.shapelet_lengths = self._set_lengths()
         shapelet_lengths, seed = self._check_params(self.min_len)
-        print(shapelet_lengths)
         # Generate the shapelets
         if self.transform_type == STR_UNIVARIATE_VARIABLE:
             self.shapelets_ = self.fitter(
@@ -482,7 +488,7 @@ def _check_params(self, n_timestamps):
                 raise ValueError('Input data goint {} timestamps, at least 4 are requiered. Input format should be (n_samples, n_features, n_timestamps)'.format(n_timestamps))
             else:
                 warnings.warn("All the values in 'shapelet_lengths' must be lower than or equal to 'n_timestamps' (got {} > {}). Changed shapelet size to {}".format(shapelet_lengths.max(), n_timestamps, n_timestamps//2))
-                shapelet_lengths = np.array([n_timestamps//2])
+                shapelet_lengths = shapelet_lengths[shapelet_lengths > n_timestamps] = n_timestamps//2
 
         rng = check_random_state(self.random_state)
 
diff --git a/convst/utils/dataset_utils.py b/convst/utils/dataset_utils.py
index 40966e6..2d2d45f 100644
--- a/convst/utils/dataset_utils.py
+++ b/convst/utils/dataset_utils.py
@@ -18,7 +18,6 @@ def _custom_from_nested_to_3d_numpy(X):
         return np.array([X[i].values.T for i in range(len(X))])
     else:
         return [X[i].values.T for i in range(len(X))]
-    
 
 @njit(cache=True)
 def z_norm_3D(X):
@@ -66,7 +65,7 @@ def z_norm_3D_list(X):
 
     return X
 
-def load_sktime_dataset_split(name, normalize=True):
+def load_sktime_dataset_split(name, normalize=False):
     """
     Load the original train and test splits of a dataset from the UCR/UEA
     archive by name using sktime API.
@@ -120,7 +119,7 @@
 
     return X_train, X_test, y_train, y_test, min_len
 
-def load_sktime_arff_file(path, normalize=True):
+def load_sktime_arff_file(path, normalize=False):
     """
     Load a dataset from .arff files.
 
@@ -171,7 +170,7 @@
 
     return X_train, X_test, y_train, y_test, le
 
-def load_sktime_arff_file_resample_id(path, rs_id, normalize=True):
+def load_sktime_arff_file_resample_id(path, rs_id, normalize=False):
     """
     Load a dataset resample from .arff files and the identifier of the resample.
 
@@ -224,7 +223,7 @@ def load_sktime_arff_file_resample_id(path, rs_id, normalize=True):
 
     return X_train, X_test, y_train, y_test, le
 
-def load_sktime_ts_file(path, normalize=True):
+def load_sktime_ts_file(path, normalize=False):
     """
     Load a dataset from .ts files
 
@@ -274,7 +273,7 @@ def load_sktime_ts_file(path, normalize=True):
 
    return X_train, X_test, y_train, y_test, le
 
-def load_sktime_dataset(name, normalize=True):
+def load_sktime_dataset(name, normalize=False):
     """
     Load a dataset from the UCR/UEA archive by name using sktime API
 
diff --git a/pyproject.toml b/pyproject.toml
index dd275fa..9b2b885 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "convst"
-version = "0.2.1"
+version = "0.2.3"
 description = "The Random Dilation Shapelet Transform algorithm and associated works"
 readme = "README.md"
 authors = [
diff --git a/tests/test_rdst.py b/tests/test_rdst.py
index 6c75654..6575532 100644
--- a/tests/test_rdst.py
+++ b/tests/test_rdst.py
@@ -10,6 +10,7 @@
 from convst.transformers import R_DST
 from convst.utils.dataset_utils import load_sktime_dataset_split
 from convst.utils.experiments_utils import cross_validate_UCR_UEA
+from convst.transformers._commons import is_prime
 
 import logging
 
@@ -60,6 +61,50 @@ def test_mutliple_lengths(name, lengths):
         assert False
     assert True
 
+@pytest.mark.parametrize("name", [
+    ('ForbA'),
+    ('ForbB')
+])
+def test_prime_dilations(name):
+    X_train, X_test, y_train, y_test, min_len = load_sktime_dataset_split(
+        name=name
+    )
+    try:
+        rdst = R_DST_Ridge(min_len=min_len, prime_dilations=True).fit(X_train, y_train)
+        rdst.score(X_test, y_test)
+    except Exception as e:
+        LOGGER.info('An exception as occured during prime dilation tests: {}'.format(
+            e
+        ))
+        assert False
+    assert all([is_prime(i) for i in rdst.transformer.shapelets_[2]])
+
+
+# TODO : this may fail due to unlucky generation of shapelets, if a length
+# is not selected randomly, the expected array will be bigger than actual
+@pytest.mark.parametrize("name", "bounds", "reduction", "expected" [
+    ('GunPoint', [6,12], 0., [ 6, 7, 8, 9, 10, 11, 12]),
+    ('GunPoint', [6,12], 0.5, [ 6, 8, 10, 12]),
+    ('GunPoint', [0.1,0.15], 0., [15, 16, 17, 18, 19, 20, 21, 22]),
+    ('GunPoint', [0.1,0.15], 0.5, [15, 17, 19, 21])
+])
+def test_length_bounds(name, bounds, reduction, expected):
+    X_train, X_test, y_train, y_test, min_len = load_sktime_dataset_split(
+        name=name
+    )
+    try:
+        rdst = R_DST_Ridge(
+            min_len=min_len, shapelet_lengths_bounds=bounds,
+            lengths_bounds_reduction=reduction
+        ).fit(X_train, y_train)
+        rdst.score(X_test, y_test)
+    except Exception as e:
+        LOGGER.info('An exception as occured during length bounds tests: {}'.format(
+            e
+        ))
+        assert False
+    assert all(rdst.transformer.shapelets_[1] == expected)
+
 # Lower than actual best accuracy to account for possible deviation due to random sampling
 @pytest.mark.parametrize("name, expected", [
     ('GunPoint',0.98),

From dcc52cfb925ba17e27ae86cf7e17da994ab4fb5c Mon Sep 17 00:00:00 2001
From: Antoine Guillaume
Date: Wed, 9 Nov 2022 10:46:33 +0100
Subject: [PATCH 2/4] Correcting typos in test_rdst

---
 tests/test_rdst.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_rdst.py b/tests/test_rdst.py
index 6575532..f47c613 100644
--- a/tests/test_rdst.py
+++ b/tests/test_rdst.py
@@ -82,11 +82,11 @@ def test_prime_dilations(name):
 
 # TODO : this may fail due to unlucky generation of shapelets, if a length
 # is not selected randomly, the expected array will be bigger than actual
-@pytest.mark.parametrize("name", "bounds", "reduction", "expected" [
-    ('GunPoint', [6,12], 0., [ 6, 7, 8, 9, 10, 11, 12]),
-    ('GunPoint', [6,12], 0.5, [ 6, 8, 10, 12]),
-    ('GunPoint', [0.1,0.15], 0., [15, 16, 17, 18, 19, 20, 21, 22]),
-    ('GunPoint', [0.1,0.15], 0.5, [15, 17, 19, 21])
+@pytest.mark.parametrize("name, bounds, reduction, expected" [
+    ('GunPoint', [6, 12], 0., [6, 7, 8, 9, 10, 11, 12]),
+    ('GunPoint', [6, 12], 0.5, [6, 8, 10, 12]),
+    ('GunPoint', [0.1, 0.15], 0., [15, 16, 17, 18, 19, 20, 21, 22]),
+    ('GunPoint', [0.1, 0.15], 0.5, [15, 17, 19, 21])
 ])
 def test_length_bounds(name, bounds, reduction, expected):
     X_train, X_test, y_train, y_test, min_len = load_sktime_dataset_split(

From 12193339d76d96dac6c4762e6e4bacd873c16812 Mon Sep 17 00:00:00 2001
From: Antoine Guillaume
Date: Wed, 9 Nov 2022 10:50:05 +0100
Subject: [PATCH 3/4] Tiny weeny commatiny

---
 tests/test_rdst.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_rdst.py b/tests/test_rdst.py
index f47c613..ad4eeab 100644
--- a/tests/test_rdst.py
+++ b/tests/test_rdst.py
@@ -82,7 +82,7 @@ def test_prime_dilations(name):
 
 # TODO : this may fail due to unlucky generation of shapelets, if a length
 # is not selected randomly, the expected array will be bigger than actual
-@pytest.mark.parametrize("name, bounds, reduction, expected" [
+@pytest.mark.parametrize("name, bounds, reduction, expected", [
     ('GunPoint', [6, 12], 0., [6, 7, 8, 9, 10, 11, 12]),
     ('GunPoint', [6, 12], 0.5, [6, 8, 10, 12]),
     ('GunPoint', [0.1, 0.15], 0., [15, 16, 17, 18, 19, 20, 21, 22]),

From bc39e373ce8032a418f88c958c720d31992b74f2 Mon Sep 17 00:00:00 2001
From: Antoine Guillaume
Date: Wed, 9 Nov 2022 11:08:45 +0100
Subject: [PATCH 4/4] Fixing wrong dataset names and assert condition for rdst tests

---
 tests/test_rdst.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/test_rdst.py b/tests/test_rdst.py
index ad4eeab..d457190 100644
--- a/tests/test_rdst.py
+++ b/tests/test_rdst.py
@@ -6,6 +6,8 @@
 """
 
 import pytest
+import numpy as np
+
 from convst.classifiers import R_DST_Ridge
 from convst.transformers import R_DST
 from convst.utils.dataset_utils import load_sktime_dataset_split
@@ -62,8 +64,8 @@ def test_mutliple_lengths(name, lengths):
     assert True
 
 @pytest.mark.parametrize("name", [
-    ('ForbA'),
-    ('ForbB')
+    ('FordA'),
+    ('FordB')
 ])
 def test_prime_dilations(name):
     X_train, X_test, y_train, y_test, min_len = load_sktime_dataset_split(
@@ -103,7 +105,7 @@ def test_length_bounds(name, bounds, reduction, expected):
             e
         ))
         assert False
-    assert all(rdst.transformer.shapelets_[1] == expected)
+    assert list(np.unique(rdst.transformer.shapelets_[1])) == expected
 
 # Lower than actual best accuracy to account for possible deviation due to random sampling
 @pytest.mark.parametrize("name, expected", [