Merge pull request #7 from joaopfonseca/dev
Release 0.3.3
joaopfonseca authored Oct 7, 2021
2 parents 602a814 + bac7826 commit f4c561b
Showing 6 changed files with 136 additions and 34 deletions.
49 changes: 42 additions & 7 deletions README.rst
@@ -1,7 +1,7 @@
|Python Versions| |Documentation Status| |Pypi Version|
|Python Versions| |Documentation Status| |Pypi Version| |DOI|

Research
========
ML-Research
===========

This repository contains the code developed for all the publications I
was involved in. The LaTeX and Python code for generating the paper,
@@ -41,11 +41,22 @@ Installation and Setup
A Python distribution of version 3.7 or higher is required to run this
project.

Basic Installation
~~~~~~~~~~~~~~~~~~
User Installation
~~~~~~~~~~~~~~~~~

The following commands should allow you to setup this project with
minimal effort:
If you already have a working installation of numpy and scipy, the easiest way
to install ml-research is using ``pip`` ::

pip install -U ml-research

The documentation includes more detailed `installation instructions
<https://mlresearch.readthedocs.io/en/latest/getting-started.html>`_.

Installing from source
~~~~~~~~~~~~~~~~~~~~~~

The following commands should allow you to set up the development version of the
project with minimal effort:

::

@@ -60,6 +71,27 @@ minimal effort:
# Install project requirements and the research package
make requirements

Citing ML-Research
------------------

If you use ML-Research in a scientific publication, we would appreciate
citations to the following paper::


@article{Fonseca2021,
doi = {10.3390/RS13132619},
url = {https://doi.org/10.3390/RS13132619},
keywords = {SMOTE,active learning,artificial data generation,land use/land cover classification,oversampling},
year = {2021},
month = {jul},
publisher = {Multidisciplinary Digital Publishing Institute},
volume = {13},
pages = {2619},
author = {Fonseca, Joao and Douzas, Georgios and Bacao, Fernando},
title = {{Increasing the Effectiveness of Active Learning: Introducing Artificial Data Generation in Active Learning for Land Use/Land Cover Classification}},
journal = {Remote Sensing}
}


.. |Python Versions| image:: https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue

@@ -68,3 +100,6 @@ minimal effort:

.. |Pypi Version| image:: https://badge.fury.io/py/ml-research.svg
:target: https://badge.fury.io/py/ml-research

.. |DOI| image:: https://zenodo.org/badge/DOI/10.3390/RS13132619.svg
:target: https://doi.org/10.3390/RS13132619
3 changes: 0 additions & 3 deletions requirements.dev.txt
@@ -1,9 +1,6 @@
# external requirements
click
coverage
awscli
flake8
python-dotenv>=0.5.1

# documentation requirements
Sphinx
2 changes: 1 addition & 1 deletion research/_version.py
@@ -1 +1 @@
__version__ = '0.3.2'
__version__ = '0.3.3'
76 changes: 64 additions & 12 deletions research/active_learning/_active_learning.py
@@ -7,10 +7,14 @@
# License: MIT

import numpy as np
from copy import deepcopy
from sklearn.base import clone
from sklearn.base import ClassifierMixin, BaseEstimator
from sklearn.utils import check_X_y
from sklearn.model_selection import train_test_split
from sklearn.model_selection import (
train_test_split,
GridSearchCV
)
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
@@ -56,6 +60,26 @@ class ALSimulation(ClassifierMixin, BaseEstimator):
instances to be added to the labeled/training dataset. Selection strategies may
be added or changed in the ``UNCERTAINTY_FUNCTIONS`` dictionary.
param_grid : dict or list of dictionaries
Dictionary with parameter names (``str``) as keys and lists of
parameter settings to try as values, or a list of such
dictionaries, in which case the grids spanned by each dictionary
in the list are explored. This enables searching over any sequence
of parameter settings.
cv : int, cross-validation generator or an iterable, default=None
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 5-fold cross validation,
- integer, to specify the number of folds in a `(Stratified)KFold`,
- :term:`CV splitter`.
For integer/None inputs, if the estimator is a classifier and ``y`` is
either binary or multiclass, :class:`StratifiedKFold` is used. In all
other cases, :class:`KFold` is used. These splitters are instantiated
with `shuffle=False` so the splits will be the same across calls.
max_iter : int, default=None
Maximum number of iterations allowed. If None, the experiment will run until 100%
of the dataset is added to the training set.
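
The new ``param_grid`` and ``cv`` parameters let ``ALSimulation`` tune the classifier
with ``GridSearchCV`` inside every iteration. A minimal usage sketch; the ``classifier``
keyword, the import path and the grid values are illustrative assumptions, while
``param_grid``, ``cv``, ``max_iter``, ``selection_strategy`` and ``random_state`` follow
the docstring above::

    # Sketch only: the `classifier` keyword and the import path are assumptions.
    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    from research.active_learning import ALSimulation

    X, y = load_iris(return_X_y=True)
    simulation = ALSimulation(
        classifier=RandomForestClassifier(random_state=0),  # hypothetical keyword
        selection_strategy='entropy',
        param_grid={'n_estimators': [50, 100]},  # tuned with GridSearchCV per iteration
        cv=3,
        max_iter=10,
        random_state=42,
    )
    simulation.fit(X, y)
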
@@ -119,6 +143,8 @@ def __init__(
init_clusterer=None,
init_strategy='random',
selection_strategy='entropy',
param_grid=None,
cv=None,
max_iter=None,
n_initial=.02,
increment=.02,
@@ -134,6 +160,8 @@ def __init__(
self.use_sample_weight = use_sample_weight
self.init_clusterer = init_clusterer
self.init_strategy = init_strategy
self.param_grid = param_grid
self.cv = cv
self.selection_strategy = selection_strategy
self.max_iter = max_iter
self.n_initial = n_initial
@@ -149,7 +177,7 @@ def __init__(
self.random_state = random_state

def _check(self, X, y):
"""Set ups simple initialization parameters to run an AL simulation."""
"""Set up simple initialization parameters to run an AL simulation."""

X, y = check_X_y(X, y)

@@ -218,6 +246,17 @@ def _check(self, X, y):

return X, X_test, y, y_test

def _check_cross_validation(self, y):
min_frequency = np.unique(y, return_counts=True)[-1].min()
cv = deepcopy(self.cv)

if hasattr(self.cv, 'n_splits'):
cv.n_splits = min(min_frequency, self.cv.n_splits)
elif type(self.cv) == int:
cv = min(min_frequency, self.cv)

return cv

def _get_performance_scores(self):
data_utilization = [
i[1] for i in self.data_utilization_
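
The new ``_check_cross_validation`` helper shrinks the requested number of folds whenever
a class in the labeled pool has fewer samples than ``cv`` splits. A standalone sketch of
the clamping rule, using illustrative data::

    import numpy as np

    y_labeled = np.array([0, 0, 0, 0, 1, 1, 2, 2, 2])  # smallest class has 2 samples
    min_frequency = np.unique(y_labeled, return_counts=True)[-1].min()
    n_folds = min(min_frequency, 5)  # 5 folds requested -> clamped to 2
    assert n_folds == 2
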
@@ -274,7 +313,6 @@ def fit(self, X, y):
"""

# Original "unlabeled" dataset
iter_n = 0
X, X_test, y, y_test = self._check(X, y)
selection = np.zeros(shape=(X.shape[0])).astype(bool)
sample_weight = None
@@ -291,7 +329,7 @@

selection[ids] = True

while iter_n < self.max_iter_:
for iter_n in range(self.max_iter_):

# Generator + Chooser (in this case chooser==Predictor)
if self.generator is not None:
@@ -305,14 +343,31 @@
else:
classifier = clone(self._classifier)

if isinstance(classifier, Pipeline) and self.use_sample_weight:
generator = classifier.steps[-2][-1]
classifier.steps[-2] = ('generator', generator)
# Set up parameter tuning within iterations
if self.param_grid is not None:
cv = self._check_cross_validation(y[selection])
classifier = GridSearchCV(
estimator=classifier,
param_grid=self.param_grid,
scoring=self.evaluation_metric,
cv=cv,
refit=True
)

# Generate artificial data and train classifier
if self.use_sample_weight:
classifier.fit(X[selection], y[selection],
generator__sample_weight=sample_weight)

# Save oversampler name to pass sample weight
ovr_name = (
classifier.steps[-2][0]
if self.param_grid is None
else classifier.estimator.steps[-2][0]
)

classifier.fit(
X[selection], y[selection],
**{f'{ovr_name}__sample_weight': sample_weight}
)

# Compute the class probabilities of labeled observations
labeled_ids = np.argwhere(selection).squeeze()
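
The ``f'{ovr_name}__sample_weight'`` keyword uses the standard pipeline convention of
routing ``<step name>__<parameter>`` fit arguments to the named step. A plain
scikit-learn sketch of the same convention, with illustrative estimators::

    import numpy as np
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression

    X = np.random.RandomState(0).normal(size=(20, 3))
    y = np.array([0] * 10 + [1] * 10)
    weights = np.ones(20)

    pipe = Pipeline([('scaler', StandardScaler()), ('clf', LogisticRegression())])
    pipe.fit(X, y, clf__sample_weight=weights)  # forwarded to LogisticRegression.fit
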
@@ -361,9 +416,6 @@ def fit(self, X, y):
if np.isnan(sample_weight).all():
sample_weight = np.ones(sample_weight.shape)

# keep track of iter_n
iter_n += 1

# stop if all examples have been included
if selection.all():
break
19 changes: 16 additions & 3 deletions research/data_augmentation/_oversampling_augmentation.py
@@ -79,7 +79,9 @@ class OverSamplingAugmentation(BaseOverSampler):
according to the specified ratio.
- When ``oversampling``, the data augmentation is done according to the
sampling strategy passed in the ``oversampler`` object.
sampling strategy passed in the ``oversampler`` object. If ``value`` is not
`None`, then the number of samples generated for each class equals the number
of samples in the majority class multiplied by ``value``.
- When ``constant``, each class frequency is augmented to match
the value passed in the parameter ``value``.
Expand All @@ -90,7 +92,8 @@ class OverSamplingAugmentation(BaseOverSampler):
value : int, float, default=None
Value to be used as the new absolute frequency of each class. It is
ignored unless the augmentation strategy is set to 'constant'.
ignored unless the augmentation strategy is set to ``constant`` or
``oversampling``.
random_state : int, RandomState instance, default=None
Control the randomization of the algorithm.
@@ -182,9 +185,19 @@ def fit(self, X, y):
f" original dataset. Originally, there are {y.shape[0]} samples"
f" and {self.value} samples are asked."
)
elif self.augmentation_strategy == 'oversampling':
elif self.augmentation_strategy == 'oversampling' and self.value is None:
self.sampling_strategy_ = self.oversampler.sampling_strategy

elif self.augmentation_strategy == 'oversampling':
counts = OrderedDict(Counter(y))
max_freq = max(counts.values())
self.sampling_strategy_ = {
k: int(np.round(max_freq*self.value))
if max_freq*self.value > freq
else freq
for k, freq in counts.items()
}

elif type(self.augmentation_strategy) in [int, float]:
counts = OrderedDict(Counter(y))
self.sampling_strategy_ = {
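
With the new ``oversampling`` branch, a non-``None`` ``value`` targets ``value`` times the
majority-class frequency for every class that falls below that number. A standalone
worked sketch of the resulting ``sampling_strategy_``, using illustrative counts::

    from collections import Counter, OrderedDict
    import numpy as np

    y = np.array([0] * 100 + [1] * 30 + [2] * 10)
    value = 0.5
    counts = OrderedDict(Counter(y))
    max_freq = max(counts.values())  # 100
    sampling_strategy = {
        k: int(np.round(max_freq * value)) if max_freq * value > freq else freq
        for k, freq in counts.items()
    }
    # sampling_strategy == {0: 100, 1: 50, 2: 50}: minority classes are raised to 50
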
21 changes: 13 additions & 8 deletions research/utils/_check_pipelines.py
@@ -3,6 +3,7 @@
from imblearn.pipeline import Pipeline
from sklearn.base import clone


def check_pipelines(objects_list, random_state, n_runs):
"""Extract estimators and parameters grids."""

@@ -11,21 +12,25 @@ def check_pipelines(objects_list, random_state, n_runs):

pipelines = []
param_grid = []
for comb in product(*objects_list):
for comb, rs in product(product(*objects_list), random_states):
name = '|'.join([i[0] for i in comb])

# name, object, grid
comb = [(nm, ob, grd) for nm, ob, grd in comb if ob is not None]
# name, object, sub grid
comb = [(nm, ob, sg) for nm, ob, sg in comb if ob is not None]

pipelines.append((name, Pipeline([(nm, ob) for nm, ob, _ in comb])))
if name not in [n[0] for n in pipelines]:
pipelines.append((name, Pipeline([(nm, ob) for nm, ob, _ in comb])))

grids = {'est_name': [name]}
grid = {'est_name': [name]}
for obj_name, obj, sub_grid in comb:
if 'random_state' in obj.get_params().keys():
grids[f'{name}__{obj_name}__random_state'] = random_states
grid[f'{name}__{obj_name}__random_state'] = [rs]
for param, values in sub_grid.items():
grids[f'{name}__{obj_name}__{param}'] = values
param_grid.append(grids)
grid[f'{name}__{obj_name}__{param}'] = values

# Avoid multiple runs over pipelines without random state
if grid not in param_grid:
param_grid.append(grid)

return pipelines, param_grid

Expand Down
