diff --git a/README.rst b/README.rst
index b31ff26..a9b43ea 100644
--- a/README.rst
+++ b/README.rst
@@ -1,7 +1,7 @@
-|Python Versions| |Documentation Status| |Pypi Version|
+|Python Versions| |Documentation Status| |Pypi Version| |DOI|
 
-Research
-========
+ML-Research
+===========
 
 This repository contains the code developed for all the publications I
 was involved in. The LaTeX and Python code for generating the paper,
@@ -41,11 +41,22 @@ Installation and Setup
 A Python distribution of version 3.7 or higher is required to run this
 project.
 
-Basic Installation
-~~~~~~~~~~~~~~~~~~
+User Installation
+~~~~~~~~~~~~~~~~~
 
-The following commands should allow you to setup this project with
-minimal effort:
+If you already have a working installation of numpy and scipy, the easiest way
+to install ml-research is using ``pip`` ::
+
+    pip install -U ml-research
+
+The documentation includes more detailed `installation instructions
+`_.
+
+Installing from source
+~~~~~~~~~~~~~~~~~~~~~~
+
+The following commands should allow you to set up the development version of
+the project with minimal effort:
 
 ::
 
@@ -60,6 +71,27 @@ minimal effort:
 
     # Install project requirements and the research package
     make requirements
 
+Citing ML-Research
+------------------
+
+If you use ML-Research in a scientific publication, we would appreciate
+citations to the following paper::
+
+
+  @article{Fonseca2021,
+    doi = {10.3390/RS13132619},
+    url = {https://doi.org/10.3390/RS13132619},
+    keywords = {SMOTE,active learning,artificial data generation,land use/land cover classification,oversampling},
+    year = {2021},
+    month = {jul},
+    publisher = {Multidisciplinary Digital Publishing Institute},
+    volume = {13},
+    pages = {2619},
+    author = {Fonseca, Joao and Douzas, Georgios and Bacao, Fernando},
+    title = {{Increasing the Effectiveness of Active Learning: Introducing Artificial Data Generation in Active Learning for Land Use/Land Cover Classification}},
+    journal = {Remote Sensing}
+  }
+
 .. |Python Versions| image:: https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue
 
@@ -68,3 +100,6 @@ minimal effort:
 
 .. |Pypi Version| image:: https://badge.fury.io/py/ml-research.svg
    :target: https://badge.fury.io/py/ml-research
+
+.. |DOI| image:: https://zenodo.org/badge/DOI/10.3390/RS13132619.svg
+   :target: https://doi.org/10.3390/RS13132619
diff --git a/requirements.dev.txt b/requirements.dev.txt
index 60b756f..7e9d095 100644
--- a/requirements.dev.txt
+++ b/requirements.dev.txt
@@ -1,9 +1,6 @@
 # external requirements
-click
 coverage
-awscli
 flake8
-python-dotenv>=0.5.1
 
 # documentation requirements
 Sphinx
diff --git a/research/_version.py b/research/_version.py
index 73e3bb4..80eb7f9 100644
--- a/research/_version.py
+++ b/research/_version.py
@@ -1 +1 @@
-__version__ = '0.3.2'
+__version__ = '0.3.3'
diff --git a/research/active_learning/_active_learning.py b/research/active_learning/_active_learning.py
index 1005211..c2be456 100644
--- a/research/active_learning/_active_learning.py
+++ b/research/active_learning/_active_learning.py
@@ -7,10 +7,14 @@
 # License: MIT
 
 import numpy as np
+from copy import deepcopy
 from sklearn.base import clone
 from sklearn.base import ClassifierMixin, BaseEstimator
 from sklearn.utils import check_X_y
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import (
+    train_test_split,
+    GridSearchCV
+)
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.ensemble import RandomForestClassifier
 from imblearn.pipeline import Pipeline
@@ -56,6 +60,26 @@ class ALSimulation(ClassifierMixin, BaseEstimator):
         instances to be added to the labeled/training dataset.
         Selection strategies may be added or changed in the
         ``UNCERTAINTY_FUNCTIONS`` dictionary.
 
+    param_grid : dict or list of dictionaries
+        Dictionary with parameter names (``str``) as keys and lists of
+        parameter settings to try as values, or a list of such
+        dictionaries, in which case the grids spanned by each dictionary
+        in the list are explored. This enables searching over any sequence
+        of parameter settings.
+
+    cv : int, cross-validation generator or an iterable, default=None
+        Determines the cross-validation splitting strategy.
+        Possible inputs for cv are:
+
+        - None, to use the default 5-fold cross validation,
+        - integer, to specify the number of folds in a `(Stratified)KFold`,
+        - :term:`CV splitter`.
+
+        For integer/None inputs, if the estimator is a classifier and ``y`` is
+        either binary or multiclass, :class:`StratifiedKFold` is used. In all
+        other cases, :class:`KFold` is used. These splitters are instantiated
+        with `shuffle=False` so the splits will be the same across calls.
+
     max_iter : int, default=None
         Maximum number of iterations allowed. If None, the experiment
         will run until 100% of the dataset is added to the training set.
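The two parameters documented above feed a ``GridSearchCV`` that is rebuilt at
every iteration of the simulation (see the ``fit`` changes below). A minimal
usage sketch, assuming ``ALSimulation`` is re-exported from
``research.active_learning`` and keeps its default classifier; the grid keys
are illustrative and must match the wrapped estimator's parameters::

    from sklearn.datasets import make_classification
    from research.active_learning import ALSimulation  # assumed re-export

    X, y = make_classification(n_samples=500, weights=[0.9, 0.1],
                               random_state=0)

    # Each iteration wraps the classifier in a GridSearchCV built from
    # param_grid and cv; _check_cross_validation caps the fold count at the
    # frequency of the rarest class in the currently labeled pool.
    simulation = ALSimulation(
        param_grid={'max_depth': [3, 6, None]},  # illustrative grid keys
        cv=5,
        max_iter=10,
        random_state=0,
    )
    simulation.fit(X, y)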
@@ -119,6 +143,8 @@ def __init__(
         init_clusterer=None,
         init_strategy='random',
         selection_strategy='entropy',
+        param_grid=None,
+        cv=None,
         max_iter=None,
         n_initial=.02,
         increment=.02,
@@ -134,6 +160,8 @@ def __init__(
         self.use_sample_weight = use_sample_weight
         self.init_clusterer = init_clusterer
         self.init_strategy = init_strategy
+        self.param_grid = param_grid
+        self.cv = cv
         self.selection_strategy = selection_strategy
         self.max_iter = max_iter
         self.n_initial = n_initial
@@ -149,7 +177,7 @@ def __init__(
         self.random_state = random_state
 
     def _check(self, X, y):
-        """Set ups simple initialization parameters to run an AL simulation."""
+        """Set up simple initialization parameters to run an AL simulation."""
 
         X, y = check_X_y(X, y)
 
@@ -218,6 +246,17 @@ def _check(self, X, y):
 
         return X, X_test, y, y_test
 
+    def _check_cross_validation(self, y):
+        min_frequency = np.unique(y, return_counts=True)[-1].min()
+        cv = deepcopy(self.cv)
+
+        if hasattr(self.cv, 'n_splits'):
+            cv.n_splits = min(min_frequency, self.cv.n_splits)
+        elif isinstance(self.cv, int):
+            cv = min(min_frequency, self.cv)
+
+        return cv
+
     def _get_performance_scores(self):
         data_utilization = [
             i[1] for i in self.data_utilization_
@@ -274,7 +313,6 @@ def fit(self, X, y):
         """
 
         # Original "unlabeled" dataset
-        iter_n = 0
         X, X_test, y, y_test = self._check(X, y)
         selection = np.zeros(shape=(X.shape[0])).astype(bool)
         sample_weight = None
@@ -291,7 +329,7 @@ def fit(self, X, y):
 
         selection[ids] = True
 
-        while iter_n < self.max_iter_:
+        for iter_n in range(self.max_iter_):
 
             # Generator + Chooser (in this case chooser==Predictor)
             if self.generator is not None:
@@ -305,14 +343,31 @@ def fit(self, X, y):
             else:
                 classifier = clone(self._classifier)
 
-            if isinstance(classifier, Pipeline) and self.use_sample_weight:
-                generator = classifier.steps[-2][-1]
-                classifier.steps[-2] = ('generator', generator)
+            # Set up parameter tuning within iterations
+            if self.param_grid is not None:
+                cv = self._check_cross_validation(y[selection])
+                classifier = GridSearchCV(
+                    estimator=classifier,
+                    param_grid=self.param_grid,
+                    scoring=self.evaluation_metric,
+                    cv=cv,
+                    refit=True
+                )
 
             # Generate artificial data and train classifier
             if self.use_sample_weight:
-                classifier.fit(X[selection], y[selection],
-                               generator__sample_weight=sample_weight)
+
+                # Save oversampler name to pass sample weight
+                ovr_name = (
+                    classifier.steps[-2][0]
+                    if self.param_grid is None
+                    else classifier.estimator.steps[-2][0]
+                )
+
+                classifier.fit(
+                    X[selection], y[selection],
+                    **{f'{ovr_name}__sample_weight': sample_weight}
+                )
 
             # Compute the class probabilities of labeled observations
             labeled_ids = np.argwhere(selection).squeeze()
@@ -361,9 +416,6 @@ def fit(self, X, y):
             if np.isnan(sample_weight).all():
                 sample_weight = np.ones(sample_weight.shape)
 
-            # keep track of iter_n
-            iter_n += 1
-
             # stop if all examples have been included
             if selection.all():
                 break
diff --git a/research/data_augmentation/_oversampling_augmentation.py b/research/data_augmentation/_oversampling_augmentation.py
index d822dfc..1a66188 100644
--- a/research/data_augmentation/_oversampling_augmentation.py
+++ b/research/data_augmentation/_oversampling_augmentation.py
@@ -79,7 +79,9 @@ class OverSamplingAugmentation(BaseOverSampler):
       according to the specified ratio.
 
     - When ``oversampling``, the data augmentation is done according to the
-      sampling strategy passed in the ``oversampler`` object.
+      sampling strategy passed in the ``oversampler`` object. If ``value`` is
+      not ``None``, each class's frequency is raised to the majority class's
+      frequency multiplied by ``value``, unless it already exceeds that number.
 
     - When ``constant``, each class frequency is augmented to match
       the value passed in the parameter ``value``.
@@ -90,7 +92,8 @@ class OverSamplingAugmentation(BaseOverSampler):
 
     value : int, float, default=None
         Value to be used as the new absolute frequency of each class. It is
-        ignored unless the augmentation strategy is set to 'constant'.
+        ignored unless the augmentation strategy is set to ``constant`` or
+        ``oversampling``.
 
     random_state : int, RandomState instance, default=None
         Control the randomization of the algorithm.
@@ -182,9 +185,19 @@ def fit(self, X, y):
                 f" original dataset. Originally, there are {y.shape[0]} samples"
                 f" and {self.value} samples are asked."
             )
-        elif self.augmentation_strategy == 'oversampling':
+        elif self.augmentation_strategy == 'oversampling' and self.value is None:
             self.sampling_strategy_ = self.oversampler.sampling_strategy
 
+        elif self.augmentation_strategy == 'oversampling':
+            counts = OrderedDict(Counter(y))
+            max_freq = max(counts.values())
+            self.sampling_strategy_ = {
+                k: int(np.round(max_freq*self.value))
+                if max_freq*self.value > freq
+                else freq
+                for k, freq in counts.items()
+            }
+
         elif type(self.augmentation_strategy) in [int, float]:
             counts = OrderedDict(Counter(y))
             self.sampling_strategy_ = {
diff --git a/research/utils/_check_pipelines.py b/research/utils/_check_pipelines.py
index 8d4aa72..ee4aa0b 100644
--- a/research/utils/_check_pipelines.py
+++ b/research/utils/_check_pipelines.py
@@ -3,6 +3,7 @@
 from imblearn.pipeline import Pipeline
 from sklearn.base import clone
 
+
 def check_pipelines(objects_list, random_state, n_runs):
     """Extract estimators and parameters grids."""
 
@@ -11,21 +12,25 @@ def check_pipelines(objects_list, random_state, n_runs):
     pipelines = []
     param_grid = []
 
-    for comb in product(*objects_list):
+    for comb, rs in product(product(*objects_list), random_states):
         name = '|'.join([i[0] for i in comb])
 
-        # name, object, grid
-        comb = [(nm, ob, grd) for nm, ob, grd in comb if ob is not None]
+        # name, object, sub grid
+        comb = [(nm, ob, sg) for nm, ob, sg in comb if ob is not None]
 
-        pipelines.append((name, Pipeline([(nm, ob) for nm, ob, _ in comb])))
+        if name not in [n[0] for n in pipelines]:
+            pipelines.append((name, Pipeline([(nm, ob) for nm, ob, _ in comb])))
 
-        grids = {'est_name': [name]}
+        grid = {'est_name': [name]}
         for obj_name, obj, sub_grid in comb:
             if 'random_state' in obj.get_params().keys():
-                grids[f'{name}__{obj_name}__random_state'] = random_states
+                grid[f'{name}__{obj_name}__random_state'] = [rs]
             for param, values in sub_grid.items():
-                grids[f'{name}__{obj_name}__{param}'] = values
-        param_grid.append(grids)
+                grid[f'{name}__{obj_name}__{param}'] = values
+
+        # Avoid multiple runs over pipelines without random state
+        if grid not in param_grid:
+            param_grid.append(grid)
 
     return pipelines, param_grid