Merge pull request #7 from joaopfonseca/dev
Release 0.3.3
joaopfonseca authored Oct 7, 2021
2 parents 602a814 + bac7826 commit f4c561b
Showing 6 changed files with 136 additions and 34 deletions.
49 changes: 42 additions & 7 deletions README.rst
@@ -1,7 +1,7 @@
|Python Versions| |Documentation Status| |Pypi Version|
|Python Versions| |Documentation Status| |Pypi Version| |DOI|

Research
========
ML-Research
===========

This repository contains the code developed for all the publications I
was involved in. The LaTeX and Python code for generating the paper,
@@ -41,11 +41,22 @@ Installation and Setup
A Python distribution of version 3.7 or higher is required to run this
project.

Basic Installation
~~~~~~~~~~~~~~~~~~
User Installation
~~~~~~~~~~~~~~~~~

The following commands should allow you to setup this project with
minimal effort:
If you already have a working installation of numpy and scipy, the easiest way
to install ml-research is using ``pip`` ::

pip install -U ml-research

The documentation includes more detailed `installation instructions
<https://mlresearch.readthedocs.io/en/latest/getting-started.html>`_.

Installing from source
~~~~~~~~~~~~~~~~~~~~~~

The following commands should allow you to set up the development version of the
project with minimal effort:

::

@@ -60,6 +71,27 @@ minimal effort:
# Install project requirements and the research package
make requirements

Citing ML-Research
------------------

If you use ML-Research in a scientific publication, we would appreciate
citations to the following paper::


@article{Fonseca2021,
doi = {10.3390/RS13132619},
url = {https://doi.org/10.3390/RS13132619},
keywords = {SMOTE,active learning,artificial data generation,land use/land cover classification,oversampling},
year = {2021},
month = {jul},
publisher = {Multidisciplinary Digital Publishing Institute},
volume = {13},
pages = {2619},
author = {Fonseca, Joao and Douzas, Georgios and Bacao, Fernando},
title = {{Increasing the Effectiveness of Active Learning: Introducing Artificial Data Generation in Active Learning for Land Use/Land Cover Classification}},
journal = {Remote Sensing}
}


.. |Python Versions| image:: https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue

@@ -68,3 +100,6 @@ minimal effort:

.. |Pypi Version| image:: https://badge.fury.io/py/ml-research.svg
:target: https://badge.fury.io/py/ml-research

.. |DOI| image:: https://zenodo.org/badge/DOI/10.3390/RS13132619.svg
:target: https://doi.org/10.3390/RS13132619
3 changes: 0 additions & 3 deletions requirements.dev.txt
@@ -1,9 +1,6 @@
# external requirements
click
coverage
awscli
flake8
python-dotenv>=0.5.1

# documentation requirements
Sphinx
2 changes: 1 addition & 1 deletion research/_version.py
@@ -1 +1 @@
__version__ = '0.3.2'
__version__ = '0.3.3'
76 changes: 64 additions & 12 deletions research/active_learning/_active_learning.py
@@ -7,10 +7,14 @@
# License: MIT

import numpy as np
from copy import deepcopy
from sklearn.base import clone
from sklearn.base import ClassifierMixin, BaseEstimator
from sklearn.utils import check_X_y
from sklearn.model_selection import train_test_split
from sklearn.model_selection import (
train_test_split,
GridSearchCV
)
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
@@ -56,6 +60,26 @@ class ALSimulation(ClassifierMixin, BaseEstimator):
instances to be added to the labeled/training dataset. Selection strategies may
be added or changed in the ``UNCERTAINTY_FUNCTIONS`` dictionary.
param_grid : dict or list of dictionaries
Dictionary with parameter names (``str``) as keys and lists of
parameter settings to try as values, or a list of such
dictionaries, in which case the grids spanned by each dictionary
in the list are explored. This enables searching over any sequence
of parameter settings.
cv : int, cross-validation generator or an iterable, default=None
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 5-fold cross validation,
- integer, to specify the number of folds in a `(Stratified)KFold`,
- :term:`CV splitter`.
For integer/None inputs, if the estimator is a classifier and ``y`` is
either binary or multiclass, :class:`StratifiedKFold` is used. In all
other cases, :class:`KFold` is used. These splitters are instantiated
with `shuffle=False` so the splits will be the same across calls.
max_iter : int, default=None
Maximum number of iterations allowed. If None, the experiment will run until 100%
of the dataset is added to the training set.
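
The new ``param_grid`` and ``cv`` parameters let ``ALSimulation`` tune the classifier
with ``GridSearchCV`` inside every iteration. A minimal usage sketch; the ``classifier``
keyword, the import path and the grid values are illustrative assumptions, while
``param_grid``, ``cv``, ``max_iter``, ``selection_strategy`` and ``random_state`` follow
the docstring above::

    # Sketch only: the `classifier` keyword and the import path are assumptions.
    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    from research.active_learning import ALSimulation

    X, y = load_iris(return_X_y=True)
    simulation = ALSimulation(
        classifier=RandomForestClassifier(random_state=0),  # hypothetical keyword
        selection_strategy='entropy',
        param_grid={'n_estimators': [50, 100]},  # tuned with GridSearchCV per iteration
        cv=3,
        max_iter=10,
        random_state=42,
    )
    simulation.fit(X, y)
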
@@ -119,6 +143,8 @@ def __init__(
init_clusterer=None,
init_strategy='random',
selection_strategy='entropy',
param_grid=None,
cv=None,
max_iter=None,
n_initial=.02,
increment=.02,
@@ -134,6 +160,8 @@ def __init__(
self.use_sample_weight = use_sample_weight
self.init_clusterer = init_clusterer
self.init_strategy = init_strategy
self.param_grid = param_grid
self.cv = cv
self.selection_strategy = selection_strategy
self.max_iter = max_iter
self.n_initial = n_initial
@@ -149,7 +177,7 @@ def __init__(
self.random_state = random_state

def _check(self, X, y):
"""Set ups simple initialization parameters to run an AL simulation."""
"""Set up simple initialization parameters to run an AL simulation."""

X, y = check_X_y(X, y)

@@ -218,6 +246,17 @@ def _check(self, X, y):

return X, X_test, y, y_test

def _check_cross_validation(self, y):
min_frequency = np.unique(y, return_counts=True)[-1].min()
cv = deepcopy(self.cv)

if hasattr(self.cv, 'n_splits'):
cv.n_splits = min(min_frequency, self.cv.n_splits)
elif type(self.cv) == int:
cv = min(min_frequency, self.cv)

return cv

def _get_performance_scores(self):
data_utilization = [
i[1] for i in self.data_utilization_
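
The new ``_check_cross_validation`` helper shrinks the requested number of folds whenever
a class in the labeled pool has fewer samples than ``cv`` splits. A standalone sketch of
the clamping rule, using illustrative data::

    import numpy as np

    y_labeled = np.array([0, 0, 0, 0, 1, 1, 2, 2, 2])  # smallest class has 2 samples
    min_frequency = np.unique(y_labeled, return_counts=True)[-1].min()
    n_folds = min(min_frequency, 5)  # 5 folds requested -> clamped to 2
    assert n_folds == 2
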
@@ -274,7 +313,6 @@ def fit(self, X, y):
"""

# Original "unlabeled" dataset
iter_n = 0
X, X_test, y, y_test = self._check(X, y)
selection = np.zeros(shape=(X.shape[0])).astype(bool)
sample_weight = None
@@ -291,7 +329,7 @@

selection[ids] = True

while iter_n < self.max_iter_:
for iter_n in range(self.max_iter_):

# Generator + Chooser (in this case chooser==Predictor)
if self.generator is not None:
@@ -305,14 +343,31 @@
else:
classifier = clone(self._classifier)

if isinstance(classifier, Pipeline) and self.use_sample_weight:
generator = classifier.steps[-2][-1]
classifier.steps[-2] = ('generator', generator)
# Set up parameter tuning within iterations
if self.param_grid is not None:
cv = self._check_cross_validation(y[selection])
classifier = GridSearchCV(
estimator=classifier,
param_grid=self.param_grid,
scoring=self.evaluation_metric,
cv=cv,
refit=True
)

# Generate artificial data and train classifier
if self.use_sample_weight:
classifier.fit(X[selection], y[selection],
generator__sample_weight=sample_weight)

# Save oversampler name to pass sample weight
ovr_name = (
classifier.steps[-2][0]
if self.param_grid is None
else classifier.estimator.steps[-2][0]
)

classifier.fit(
X[selection], y[selection],
**{f'{ovr_name}__sample_weight': sample_weight}
)

# Compute the class probabilities of labeled observations
labeled_ids = np.argwhere(selection).squeeze()
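
The ``f'{ovr_name}__sample_weight'`` keyword uses the standard pipeline convention of
routing ``<step name>__<parameter>`` fit arguments to the named step. A plain
scikit-learn sketch of the same convention, with illustrative estimators::

    import numpy as np
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression

    X = np.random.RandomState(0).normal(size=(20, 3))
    y = np.array([0] * 10 + [1] * 10)
    weights = np.ones(20)

    pipe = Pipeline([('scaler', StandardScaler()), ('clf', LogisticRegression())])
    pipe.fit(X, y, clf__sample_weight=weights)  # forwarded to LogisticRegression.fit
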
@@ -361,9 +416,6 @@ def fit(self, X, y):
if np.isnan(sample_weight).all():
sample_weight = np.ones(sample_weight.shape)

# keep track of iter_n
iter_n += 1

# stop if all examples have been included
if selection.all():
break
19 changes: 16 additions & 3 deletions research/data_augmentation/_oversampling_augmentation.py
@@ -79,7 +79,9 @@ class OverSamplingAugmentation(BaseOverSampler):
according to the specified ratio.
- When ``oversampling``, the data augmentation is done according to the
sampling strategy passed in the ``oversampler`` object.
sampling strategy passed in the ``oversampler`` object. If ``value`` is not
`None`, then the number of samples generated for each class equals the number
of samples in the majority class multiplied by ``value``.
- When ``constant``, each class frequency is augmented to match
the value passed in the parameter ``value``.
Expand All @@ -90,7 +92,8 @@ class OverSamplingAugmentation(BaseOverSampler):
value : int, float, default=None
Value to be used as the new absolute frequency of each class. It is
ignored unless the augmentation strategy is set to 'constant'.
ignored unless the augmentation strategy is set to ``constant`` or
``oversampling``.
random_state : int, RandomState instance, default=None
Control the randomization of the algorithm.
@@ -182,9 +185,19 @@ def fit(self, X, y):
f" original dataset. Originally, there are {y.shape[0]} samples"
f" and {self.value} samples are asked."
)
elif self.augmentation_strategy == 'oversampling':
elif self.augmentation_strategy == 'oversampling' and self.value is None:
self.sampling_strategy_ = self.oversampler.sampling_strategy

elif self.augmentation_strategy == 'oversampling':
counts = OrderedDict(Counter(y))
max_freq = max(counts.values())
self.sampling_strategy_ = {
k: int(np.round(max_freq*self.value))
if max_freq*self.value > freq
else freq
for k, freq in counts.items()
}

elif type(self.augmentation_strategy) in [int, float]:
counts = OrderedDict(Counter(y))
self.sampling_strategy_ = {
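
With the new ``oversampling`` branch, a non-``None`` ``value`` targets ``value`` times the
majority-class frequency for every class that falls below that number. A standalone
worked sketch of the resulting ``sampling_strategy_``, using illustrative counts::

    from collections import Counter, OrderedDict
    import numpy as np

    y = np.array([0] * 100 + [1] * 30 + [2] * 10)
    value = 0.5
    counts = OrderedDict(Counter(y))
    max_freq = max(counts.values())  # 100
    sampling_strategy = {
        k: int(np.round(max_freq * value)) if max_freq * value > freq else freq
        for k, freq in counts.items()
    }
    # sampling_strategy == {0: 100, 1: 50, 2: 50}: minority classes are raised to 50
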
21 changes: 13 additions & 8 deletions research/utils/_check_pipelines.py
@@ -3,6 +3,7 @@
from imblearn.pipeline import Pipeline
from sklearn.base import clone


def check_pipelines(objects_list, random_state, n_runs):
"""Extract estimators and parameters grids."""

@@ -11,21 +12,25 @@ def check_pipelines(objects_list, random_state, n_runs):

pipelines = []
param_grid = []
for comb in product(*objects_list):
for comb, rs in product(product(*objects_list), random_states):
name = '|'.join([i[0] for i in comb])

# name, object, grid
comb = [(nm, ob, grd) for nm, ob, grd in comb if ob is not None]
# name, object, sub grid
comb = [(nm, ob, sg) for nm, ob, sg in comb if ob is not None]

pipelines.append((name, Pipeline([(nm, ob) for nm, ob, _ in comb])))
if name not in [n[0] for n in pipelines]:
pipelines.append((name, Pipeline([(nm, ob) for nm, ob, _ in comb])))

grids = {'est_name': [name]}
grid = {'est_name': [name]}
for obj_name, obj, sub_grid in comb:
if 'random_state' in obj.get_params().keys():
grids[f'{name}__{obj_name}__random_state'] = random_states
grid[f'{name}__{obj_name}__random_state'] = [rs]
for param, values in sub_grid.items():
grids[f'{name}__{obj_name}__{param}'] = values
param_grid.append(grids)
grid[f'{name}__{obj_name}__{param}'] = values

# Avoid multiple runs over pipelines without random state
if grid not in param_grid:
param_grid.append(grid)

return pipelines, param_grid

Expand Down
