Merge pull request #25 from baraline/21-bug-alpha-similarity-with-multiple-input-lengths

Correction n_jobs setting, ambiguous names and prime generation
baraline · Nov 9, 2022 · 47869a6
2 parents 6fa02d5 + bc39e37
commit 47869a6
Show file tree

Hide file tree

Showing 12 changed files with 103 additions and 42 deletions.
diff --git a/convst/__init__.py b/convst/__init__.py
@@ -1,5 +1,5 @@
 
 __author__ = 'Antoine Guillaume [email protected]'
-__version__ = "0.2.2"
+__version__ = "0.2.3"
 
 __all__ = ['transformers', 'classifiers', 'utils', 'interpreters']
diff --git a/convst/classifiers/rdst_ensemble.py b/convst/classifiers/rdst_ensemble.py
@@ -16,6 +16,7 @@
 from sklearn.preprocessing import StandardScaler
 from sklearn.pipeline import make_pipeline
 
+from numba import set_num_threads
 
 class _internalRidgeCV(RidgeClassifierCV): 
     def __init__(self, **kwargs):
@@ -107,6 +108,7 @@ def fit(self, X, y):
             Derivate(),
             Periodigram()
         ]
+        set_num_threads(self.n_jobs_rdst)
         models = Parallel(
             n_jobs=self.n_jobs,
             prefer=self.backend,
@@ -118,7 +120,7 @@ def fit(self, X, y):
                     R_DST(
                         n_shapelets=self.n_shapelets_per_estimator,
                         alpha=self.shp_alpha, n_samples=self.n_samples, 
-                        proba_norm=self.proba_norm[i], n_jobs=-1,
+                        proba_norm=self.proba_norm[i], n_jobs=False,
                         shapelet_lengths=self.shapelet_lengths,
                         phase_invariance=self.phase_invariance,
                         prime_dilations=self.prime_dilations,

diff --git a/convst/classifiers/rdst_ridge.py b/convst/classifiers/rdst_ridge.py
@@ -9,8 +9,11 @@
 from convst.transformers._input_transformers import c_StandardScaler
 from convst.transformers import R_DST
 
+from convst.utils.checks_utils import check_n_jobs
 from sklearn.metrics import accuracy_score
 
+from numba import set_num_threads
+
 class R_DST_Ridge(BaseEstimator, ClassifierMixin):
     """
     A wrapper class which use R_DST as a transformer, followed by a Ridge 
@@ -95,10 +98,14 @@ def __init__(
         self.shapelet_lengths=shapelet_lengths
         self.proba_norm=proba_norm
         self.percentiles=percentiles
-        self.n_jobs=n_jobs
+        if isinstance(n_jobs, bool):
+            self.n_jobs=n_jobs
+        else:
+            self.n_jobs=check_n_jobs(n_jobs)
+            set_num_threads(self.n_jobs)
         self.random_state=random_state
         self.min_len=min_len
-
+    
     def _more_tags(self):
         return {
             "capability:variable_length": True,
@@ -126,10 +133,10 @@ def _init_components(self):
             normalize_output=self.normalize_output,
             n_samples=self.n_samples,
             n_shapelets=self.n_shapelets,
+            n_jobs=False,
             shapelet_lengths=self.shapelet_lengths,
             proba_norm=self.proba_norm,
             percentiles=self.percentiles,
-            n_jobs=self.n_jobs,
             random_state=self.random_state,
             min_len=self.min_len 
         )

diff --git a/convst/transformers/_commons.py b/convst/transformers/_commons.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 from numba import njit, prange
-from numpy import float_, sqrt, zeros, unique, bool_, where, int64, all as _all
+from numpy import float_, sqrt, zeros, unique, bool_, where, int64
 
 ###############################################################################
 #                                                                             #
@@ -576,8 +576,8 @@ def _combinations_1d(x,y):
 
 @njit(cache=True)
 def prime_up_to(n):
-    is_p = zeros(n, dtype=bool_)
-    for i in range(n):
+    is_p = zeros(n+1, dtype=bool_)
+    for i in range(n+1):
         is_p[i] = is_prime(i)
     return where(is_p)[0]
 

diff --git a/convst/transformers/_multivariate_same_length.py b/convst/transformers/_multivariate_same_length.py
@@ -18,7 +18,7 @@
 from numba import njit, prange
 
 @njit(cache=True)
-def _init_random_shapelet_params(
+def M_SL_init_random_shapelet_params(
     n_shapelets, shapelet_sizes, n_timestamps, p_norm, max_channels, prime_scheme
 ):
     """
@@ -147,7 +147,7 @@ def M_SL_generate_shapelet(
 
     #Initialize shapelets
     values, lengths, dilations, threshold, normalize, n_channels, channel_ids = \
-    _init_random_shapelet_params(
+    M_SL_init_random_shapelet_params(
         n_shapelets, shapelet_sizes, n_timestamps, p_norm, max_channels, prime_scheme
     )
     #Initialize self similarity mask

diff --git a/convst/transformers/_multivariate_variable_length.py b/convst/transformers/_multivariate_variable_length.py
@@ -18,7 +18,7 @@
 from numba import njit, prange
 
 @njit(cache=True)
-def _init_random_shapelet_params(
+def M_VL_init_random_shapelet_params(
     n_shapelets, shapelet_sizes, n_timestamps, p_norm, max_channels, prime_scheme
 ):
     """
@@ -152,7 +152,7 @@ def M_VL_generate_shapelet(
 
     #Initialize shapelets
     values, lengths, dilations, threshold, normalize, n_channels, channel_ids = \
-    _init_random_shapelet_params(
+    M_VL_init_random_shapelet_params(
         n_shapelets, shapelet_sizes, min_len, p_norm, max_channels, prime_scheme
     )
     #Initialize self similarity mask

diff --git a/convst/transformers/_univariate_same_length.py b/convst/transformers/_univariate_same_length.py
@@ -17,7 +17,7 @@
 from numba import njit, prange
 
 @njit(cache=True)
-def _init_random_shapelet_params(
+def U_SL_init_random_shapelet_params(
     n_shapelets, shapelet_sizes, n_timestamps, p_norm, prime_scheme
 ):
     """
@@ -64,8 +64,6 @@ def _init_random_shapelet_params(
         for i in prange(n_shapelets):
             powers[i] = uniform(0, upper_bounds[i])
         dilations = floor(power(2, powers)).astype(int64)
-
-    #PRIME DILATION    
     # Init threshold array
     threshold = zeros(n_shapelets)
 
@@ -135,7 +133,7 @@ def U_SL_generate_shapelet(
 
     #Initialize shapelets
     values, lengths, dilations, threshold, normalize = \
-    _init_random_shapelet_params(
+    U_SL_init_random_shapelet_params(
         n_shapelets, shapelet_sizes, n_timestamps, p_norm, prime_scheme
     )
     #Initialize self similarity mask

diff --git a/convst/transformers/_univariate_variable_length.py b/convst/transformers/_univariate_variable_length.py
@@ -20,7 +20,7 @@
 # TODO : check if numba could support Tuple of variable length numpy arrays as input
 
 @njit(cache=True)
-def _init_random_shapelet_params(
+def U_VL_init_random_shapelet_params(
     n_shapelets, shapelet_sizes, n_timestamps, p_norm, prime_scheme
 ):
     """
@@ -145,7 +145,7 @@ def U_VL_generate_shapelet(
 
     #Initialize shapelets
     values, lengths, dilations, threshold, normalize = \
-    _init_random_shapelet_params(
+    U_VL_init_random_shapelet_params(
         n_shapelets, shapelet_sizes, min_len, p_norm, prime_scheme
     )
 

diff --git a/convst/transformers/rdst.py b/convst/transformers/rdst.py
@@ -13,8 +13,8 @@
 from sklearn.utils.validation import check_is_fitted, check_random_state
 
 from convst.utils.checks_utils import (
-    check_array_3D, check_array_1D, check_n_jobs, check_is_numeric, 
-    check_is_boolean
+    check_array_3D, check_array_1D, check_is_numeric, 
+    check_is_boolean, check_n_jobs
 )
 from convst.transformers._commons import manhattan, euclidean, squared_euclidean
 
@@ -108,10 +108,10 @@ def __init__(
         prime_dilations=False,
         proba_norm=0.8,
         percentiles=[5,10],
-        n_jobs=1,
         random_state=None,
         max_channels=None,
-        min_len=None 
+        min_len=None,
+        n_jobs=1
     ):
         self.transform_type = self._validate_transform_type(transform_type)
         self.phase_invariance = check_is_boolean(phase_invariance)
@@ -124,18 +124,21 @@ def __init__(
         if shapelet_lengths_bounds is None:
             self.shapelet_lengths_bounds = None
         elif len(shapelet_lengths_bounds)==2:
-            self.shapelet_lengths_bounds = check_array_1D(shapelet_lengths_bounds)
+            self.shapelet_lengths_bounds = shapelet_lengths_bounds
         else:
             raise ValueError('Shapelets lengths bounds should be a 1D array with 2 values')
         self.lengths_bounds_reduction=check_is_numeric(lengths_bounds_reduction)
+        if self.lengths_bounds_reduction>=1:
+            raise ValueError('lengths_bounds_reduction parameter should be in range [0,1[')
         self.prime_dilations = check_is_boolean(prime_dilations)
         self.proba_norm = check_is_numeric(proba_norm)
         self.percentiles = self._validate_percentiles(percentiles)
-        if n_jobs != -1:
-            self.n_jobs = check_n_jobs(n_jobs)
-        else:
-            self.n_jobs = n_jobs
         self.random_state = check_random_state(random_state)
+        if isinstance(n_jobs, bool):
+            self.n_jobs=n_jobs
+        else:
+            self.n_jobs=check_n_jobs(n_jobs)
+            set_num_threads(self.n_jobs)
         self.max_channels=max_channels
         self.min_len=min_len
 
@@ -148,14 +151,20 @@ def _set_lengths(self):
         else:
             b0 = self.shapelet_lengths_bounds[0]
             b1 = self.shapelet_lengths_bounds[1]
-            min_l = max(5,int(b0*self.min_len))
-            max_l = max(6,int(b1*self.min_len))
+
+            if isinstance(b0, float):
+                b0 = int(b0*self.min_len)
+            min_l = max(5,b0)
+            if isinstance(b1, float):
+                b1 = int(b1*self.min_len)
+            max_l = max(6,max(b0+1,b1+1))
             #6 to ensure range 5,6 -> 5
             lengths = np.asarray(list(range(min_l, max_l)))
             if lengths.shape[0]>3:
                 n_remove = int(lengths.shape[0]*self.lengths_bounds_reduction)
-                step = lengths.shape[0]//n_remove
-                lengths = lengths[::step]
+                if n_remove > 0:
+                    step = lengths.shape[0]//n_remove
+                    lengths = lengths[::step]
             return lengths
 
     def fit(self, X, y):
@@ -173,8 +182,6 @@ def fit(self, X, y):
             Class of the input time series.
 
         """
-        if self.n_jobs != -1:
-            set_num_threads(self.n_jobs)
         self._set_fit_transform(X)
         if self.transform_type in [STR_MULTIVARIATE_VARIABLE, STR_UNIVARIATE_VARIABLE]:
             X, X_len = self._format_uneven_timestamps(X)
@@ -205,7 +212,6 @@ def fit(self, X, y):
         self.shapelet_lengths = self._set_lengths()
 
         shapelet_lengths, seed = self._check_params(self.min_len)
-        print(shapelet_lengths)
         # Generate the shapelets
         if self.transform_type == STR_UNIVARIATE_VARIABLE:
             self.shapelets_ = self.fitter(
@@ -482,7 +488,7 @@ def _check_params(self, n_timestamps):
                 raise ValueError('Input data goint {} timestamps, at least 4 are requiered. Input format should be (n_samples, n_features, n_timestamps)'.format(n_timestamps))
             else:
                 warnings.warn("All the values in 'shapelet_lengths' must be lower than or equal to 'n_timestamps' (got {} > {}). Changed shapelet size to {}".format(shapelet_lengths.max(), n_timestamps, n_timestamps//2))
-                shapelet_lengths = np.array([n_timestamps//2])
+                shapelet_lengths = shapelet_lengths[shapelet_lengths > n_timestamps] = n_timestamps//2
 
 
         rng = check_random_state(self.random_state)

diff --git a/convst/utils/dataset_utils.py b/convst/utils/dataset_utils.py
@@ -18,7 +18,6 @@ def _custom_from_nested_to_3d_numpy(X):
         return np.array([X[i].values.T for i in range(len(X))])
     else:
         return [X[i].values.T for i in range(len(X))]
-
 
 @njit(cache=True)
 def z_norm_3D(X):
@@ -66,7 +65,7 @@ def z_norm_3D_list(X):
     return X
 
 
-def load_sktime_dataset_split(name, normalize=True):
+def load_sktime_dataset_split(name, normalize=False):
     """
     Load the original train and test splits of a dataset 
     from the UCR/UEA archive by name using sktime API.
@@ -120,7 +119,7 @@ def load_sktime_dataset_split(name, normalize=True):
     return X_train, X_test, y_train, y_test, min_len
 
 
-def load_sktime_arff_file(path, normalize=True):
+def load_sktime_arff_file(path, normalize=False):
     """
     Load a dataset from .arff files.
 
@@ -171,7 +170,7 @@ def load_sktime_arff_file(path, normalize=True):
     return X_train, X_test, y_train, y_test, le
 
 
-def load_sktime_arff_file_resample_id(path, rs_id, normalize=True):
+def load_sktime_arff_file_resample_id(path, rs_id, normalize=False):
     """
     Load a dataset resample from .arff files and the identifier of the 
     resample.
@@ -224,7 +223,7 @@ def load_sktime_arff_file_resample_id(path, rs_id, normalize=True):
 
     return X_train, X_test, y_train, y_test, le
 
-def load_sktime_ts_file(path, normalize=True):
+def load_sktime_ts_file(path, normalize=False):
     """
     Load a dataset from .ts files
 
@@ -274,7 +273,7 @@ def load_sktime_ts_file(path, normalize=True):
 
     return X_train, X_test, y_train, y_test, le
 
-def load_sktime_dataset(name, normalize=True):
+def load_sktime_dataset(name, normalize=False):
     """
     Load a dataset from the UCR/UEA archive by name using sktime API
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,8 @@
 [project]
 name = "convst"
-version = "0.2.2"
+
+version = "0.2.3"
+
 description = "The Random Dilation Shapelet Transform algorithm and associated works"
 readme = "README.md"
 authors = [