This repository has been archived by the owner on Apr 10, 2024. It is now read-only.

Commit

Merge pull request #45 from baraline/44-bug-multivariate-channel-initialisation

Fix for parallel multivariate initialisation
baraline authored Mar 16, 2023
2 parents 06cc446 + 2fcab45 commit a2ec562
Showing 5 changed files with 71 additions and 91 deletions.
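
Background on the bug, as read from the diffs below: the shapelet generators run their outer loop under numba's `prange`. Previously, two scalar counters (`a1`, `a2`) were advanced inside that loop to locate each shapelet's slice of the flat `values` and `channel_ids` output arrays; because parallel iterations do not execute in a fixed order, the counters raced and multivariate initialisation wrote to the wrong slices. The fix precomputes every write offset with a cumulative sum before the loop, so each iteration owns a disjoint, precomputed slice. A minimal, self-contained sketch of that pattern (`fill_flat` is a hypothetical name, not convst's API):

```python
import numpy as np
from numba import njit, prange

@njit(parallel=True)
def fill_flat(lengths):
    # Each item i owns the slice offsets[i]:offsets[i+1] of the flat
    # output; the offsets are fixed before the parallel loop starts.
    offsets = np.concatenate(
        (np.zeros(1, dtype=np.int64), np.cumsum(lengths))
    )
    out = np.zeros(offsets[-1], dtype=np.float64)
    for i in prange(lengths.shape[0]):
        # No shared counter: every iteration writes only its own
        # precomputed slice, so the prange schedule is irrelevant.
        out[offsets[i]:offsets[i + 1]] = i
    return out

# Three variable-sized items -> flat array of size 6: [0 0 0 1 2 2]
print(fill_flat(np.array([3, 1, 2], dtype=np.int64)))
```

This mirrors the `a1 = concatenate((zeros(1, dtype=int64), cumsum(...)))` lines introduced in both multivariate transformers below.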
2 changes: 1 addition & 1 deletion convst/__init__.py
@@ -1,6 +1,6 @@

__author__ = 'Antoine Guillaume [email protected]'
__version__ = "0.2.6"
__version__ = "0.2.7"

__all__ = ['transformers', 'classifiers', 'utils', 'interpreters']

51 changes: 0 additions & 51 deletions convst/transformers/_input_transformers.py
@@ -12,8 +12,6 @@

from numba import njit, prange

-from pyts.approximation import DiscreteFourierTransform, SymbolicAggregateApproximation
-
from scipy.signal import periodogram
from scipy.fft import fht, fhtoffset

@@ -124,55 +122,6 @@ def _get_windows(self):
"cosine","exponential","tukey","taylor"]
)

-#TODO : adapt to multivariate/uneven length context
-class Sax(BaseEstimator, TransformerMixin):
-    def __init__(self, n_bins=10, strategy="uniform", random=False):
-        self.random = random
-        self.n_bins = n_bins
-        self.strategy = strategy
-
-
-    def fit(self, X, y=None):
-        if self.random:
-            self._random_init(X.shape[1])
-        self.transformer = SymbolicAggregateApproximation(
-            n_bins=self.n_bins, strategy=self.strategy, alphabet='ordinal'
-        )
-        self.transformer.fit(X[:,0,:])
-        return self
-
-    def transform(self, X):
-        X = self.transformer.transform(X[:,0,:])
-        return X[:, np.newaxis, :]
-
-    def _random_init(self, n_timestamps):
-        self.set_params(**{"n_bins":np.random.choice(np.arange(2,min(n_timestamps,26)))})
-
-#TODO : adapt to multivariate/uneven length context
-class FourrierCoefs(BaseEstimator, TransformerMixin):
-    def __init__(
-        self, n_coefs=None, drop_sum=False, anova=False, norm_mean=False,
-        norm_std=False
-    ):
-        self.n_coefs = n_coefs
-        self.drop_sum = drop_sum
-        self.anova = anova
-        self.norm_mean = norm_mean
-        self.norm_std = norm_std
-
-    def fit(self, X, y=None):
-        self.transformer = DiscreteFourierTransform(
-            n_coefs=self.n_coefs, drop_sum=self.drop_sum, anova=self.anova,
-            norm_std=self.norm_std, norm_mean=self.norm_mean,
-        )
-        self.transformer.fit(X[:,0,:], y=y)
-        return self
-
-    def transform(self, X):
-        X = self.transformer.transform(X[:,0,:])
-        return X[:, np.newaxis, :]
-
-
class FastHankelTransform(BaseEstimator, TransformerMixin):
    def __init__(
        self, dln=0.01, mu=1, offset=0.0, bias=0.0, use_optimal_offset=True
47 changes: 31 additions & 16 deletions convst/transformers/_multivariate_same_length.py
@@ -171,10 +171,10 @@ def M_SL_generate_shapelet(
        (2,unique_dil.shape[0],n_samples,n_features,n_timestamps), dtype=bool_
    )
    mask_return = ones(n_shapelets, dtype=bool_)
-    #Counter for values array indexes
-    a1 = 0
-    #Counter for channels_ids array indexes
-    a2 = 0
+    #values[idx_val[i]:idx_val[i+1]]=_val
+    a1 = concatenate((zeros(1, dtype=int64),cumsum(n_channels*lengths)))
+    #same for channels
+    a2 = concatenate((zeros(1, dtype=int64),cumsum(n_channels)))

    #For each dilation, we can do in parallel
    for i_d in prange(unique_dil.shape[0]):
@@ -251,32 +251,47 @@

_values[a3:b3] = _v
a3 = b3
-#Counter for values array indexes
-b1 = a1 + _n_channels*_length
-#Counter for channels_ids array indexes
-b2 = a2 + _n_channels

-values[a1:b1] = _values
-channel_ids[a2:b2] = _channel_ids
+values[a1[i_shp]:a1[i_shp+1]] = _values
+channel_ids[a2[i_shp]:a2[i_shp+1]] = _channel_ids

#Extract value between two percentile as threshold for SO
ps = percentile(x_dist, [p_min,p_max])
threshold[i_shp] = uniform(
    ps[0], ps[1]
)
-a1 = b1
-a2 = b2
else:
    mask_return[i_shp] = False

+lengths = lengths[mask_return]
+n_channels = n_channels[mask_return]
+mask_channel_ids = zeros(n_channels.sum(), dtype=int64)
+mask_values = zeros(
+    int64(
+        dot(lengths.astype(float64), n_channels.astype(float64))
+    )
+)
+
+c1 = 0
+c2 = 0
+for idx, i_shp in enumerate(where(mask_return)[0]):
+    d1 = c1 + (n_channels[idx] * lengths[idx])
+    d2 = c2 + n_channels[idx]
+
+    mask_values[c1:d1] = values[a1[i_shp]:a1[i_shp+1]]
+    mask_channel_ids[c2:d2] = channel_ids[a2[i_shp]:a2[i_shp+1]]
+
+    c1 = d1
+    c2 = d2

return (
-    values[:a1],
-    lengths[mask_return],
+    mask_values,
+    lengths,
    dilations[mask_return],
    threshold[mask_return],
    normalize[mask_return],
-    n_channels[mask_return],
-    channel_ids[:a2]
+    n_channels,
+    mask_channel_ids
)

@njit(cache=__USE_NUMBA_CACHE__, parallel=__USE_NUMBA_PARALLEL__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__)
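Why the new post-loop pass above is needed: with offsets precomputed for all `n_shapelets`, a candidate rejected inside the loop (`mask_return[i_shp] = False`) leaves an unused gap in `values` and `channel_ids`, so the old truncation `values[:a1]` no longer applies; the retained slices must be gathered into the contiguous `mask_values` and `mask_channel_ids` arrays. A self-contained sketch of that gather step, with hypothetical names (`compact` is not a convst function):

```python
import numpy as np

def compact(values, offsets, keep):
    # Copy only the slices of kept items into a contiguous array,
    # closing the gaps left by rejected items.
    sizes = np.diff(offsets)                  # per-item slice sizes
    out = np.zeros(int(sizes[keep].sum()), dtype=values.dtype)
    c = 0
    for i in np.where(keep)[0]:
        d = c + sizes[i]
        out[c:d] = values[offsets[i]:offsets[i + 1]]
        c = d
    return out

vals = np.array([1., 1., 2., 3., 3., 3.])    # slices: [1,1] [2] [3,3,3]
offs = np.array([0, 2, 3, 6])
keep = np.array([True, False, True])
print(compact(vals, offs, keep))             # -> [1. 1. 3. 3. 3.]
```

The variable-length transformer below receives the identical fix.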
47 changes: 32 additions & 15 deletions convst/transformers/_multivariate_variable_length.py
@@ -176,9 +176,10 @@ def M_VL_generate_shapelet(
    )
    mask_return = ones(n_shapelets, dtype=bool_)
    #Counter for values array indexes
-    a1 = 0
-    #Counter for channels_ids array indexes
-    a2 = 0
+    a1 = concatenate((zeros(1, dtype=int64),cumsum(n_channels*lengths)))
+    #Indexes per shapelets for channel_ids array
+    a2 = concatenate((zeros(1, dtype=int64),cumsum(n_channels)))

    for i in prange(n_samples):
        mask_sampling[:,:,i,:,X_len[i]:] = 0

@@ -280,32 +281,48 @@

_values[a3:b3] = _v
a3 = b3
-#Counter for values array indexes
-b1 = a1 + _n_channels*_length
-#Counter for channels_ids array indexes
-b2 = a2 + _n_channels

-values[a1:b1] = _values
-channel_ids[a2:b2] = _channel_ids
+values[a1[i_shp]:a1[i_shp+1]] = _values
+channel_ids[a2[i_shp]:a2[i_shp+1]] = _channel_ids

#Extract value between two percentile as threshold for SO
ps = percentile(x_dist, [p_min,p_max])
threshold[i_shp] = uniform(
    ps[0], ps[1]
)
-a1 = b1
-a2 = b2
else:
    mask_return[i_shp] = False

+lengths = lengths[mask_return]
+n_channels = n_channels[mask_return]
+mask_channel_ids = zeros(n_channels.sum(), dtype=int64)
+mask_values = zeros(
+    int64(
+        dot(lengths.astype(float64), n_channels.astype(float64))
+    )
+)
+
+c1 = 0
+c2 = 0
+for idx, i_shp in enumerate(where(mask_return)[0]):
+    d1 = c1 + (n_channels[idx] * lengths[idx])
+    d2 = c2 + n_channels[idx]
+
+    mask_values[c1:d1] = values[a1[i_shp]:a1[i_shp+1]]
+    mask_channel_ids[c2:d2] = channel_ids[a2[i_shp]:a2[i_shp+1]]
+
+    c1 = d1
+    c2 = d2

return (
-    values[:a1],
-    lengths[mask_return],
+    mask_values,
+    lengths,
    dilations[mask_return],
    threshold[mask_return],
    normalize[mask_return],
-    n_channels[mask_return],
-    channel_ids[:a2]
+    n_channels,
+    mask_channel_ids
)

@njit(cache=__USE_NUMBA_CACHE__, parallel=__USE_NUMBA_PARALLEL__, fastmath=__USE_NUMBA_FASTMATH__, nogil=__USE_NUMBA_NOGIL__)
15 changes: 7 additions & 8 deletions pyproject.toml
@@ -1,7 +1,7 @@
[project]
name = "convst"

version = "0.2.6"
version = "0.2.7"

description = "The Random Dilation Shapelet Transform algorithm and associated works"
readme = "README.md"
@@ -36,15 +36,14 @@ requires-python = ">=3.7,<3.11"
dependencies = [
"sktime>=0.15",
"numba>=0.55",
"numpy>=1.21.0",
"pandas>=1.1.0",
"numpy>=1.21.0,<1.25",
"pandas>=1.1.0,<1.6.0",
"joblib>=1.1.1",
"scikit-learn>=1.0",
"scikit-learn>=0.24.0,<1.3.0",
"statsmodels>=0.12.1",
"scipy>=1.2.0",
"pyts>=0.12",
"matplotlib>=3.3.2",
"seaborn>=0.11.0",
"scipy<2.0.0,>=1.2.0",
"matplotlib>=3.1",
"seaborn>=0.10.0",
"pytest>=7.0",
"sphinx >= 4.2.0",
"sphinx_gallery >= 0.10.1",
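
A note on these pins: the new upper bounds (numpy below 1.25, pandas below 1.6, scikit-learn below 1.3) freeze the build against releases that were upcoming at the time, and `pyts` is dropped along with the removed `Sax`/`FourrierCoefs` transformers above. Since the repository is now archived, reproducing this environment presumably means installing the released wheel, e.g. `pip install "convst==0.2.7"`, assuming the PyPI distribution name matches the project name.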
