diff --git a/HISTORY.md b/HISTORY.md index fa41f42cc..5accc1118 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,9 @@ # History +## 0.2.3 - 2020-07-09 + +* Implement OneHot and Label encoding as transformers - Issue [#112](https://github.com/sdv-dev/RDT/issues/112) by @csala + ## 0.2.2 - 2020-06-26 ### Bugs Fixed diff --git a/rdt/__init__.py b/rdt/__init__.py index b7442e489..5d4a4a4a3 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -5,7 +5,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.2' +__version__ = '0.2.3.dev1' import numpy as np import pandas as pd diff --git a/rdt/hyper_transformer.py b/rdt/hyper_transformer.py index 9572c9396..06a7cdcd8 100644 --- a/rdt/hyper_transformer.py +++ b/rdt/hyper_transformer.py @@ -151,10 +151,10 @@ def transform(self, data): shape = transformed.shape - if len(shape) == 2 and shape[1] == 2: - data[column_name] = transformed[:, 0] - new_column = '{}#{}'.format(column_name, 1) - data[new_column] = transformed[:, 1] + if len(shape) == 2: + for index in range(shape[1]): + new_column = '{}#{}'.format(column_name, index) + data[new_column] = transformed[:, index] else: data[column_name] = transformed diff --git a/rdt/transformers/__init__.py b/rdt/transformers/__init__.py index c1f9391fa..366e91745 100644 --- a/rdt/transformers/__init__.py +++ b/rdt/transformers/__init__.py @@ -1,6 +1,7 @@ from rdt.transformers.base import BaseTransformer from rdt.transformers.boolean import BooleanTransformer -from rdt.transformers.categorical import CategoricalTransformer +from rdt.transformers.categorical import ( + CategoricalTransformer, LabelEncodingTransformer, OneHotEncodingTransformer) from rdt.transformers.datetime import DatetimeTransformer from rdt.transformers.null import NullTransformer from rdt.transformers.numerical import NumericalTransformer @@ -12,6 +13,8 @@ 'DatetimeTransformer', 'NumericalTransformer', 'NullTransformer', + 'OneHotEncodingTransformer', + 
'LabelEncodingTransformer', ] TRANSFORMERS = { diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index dded9dcec..2a4b227c7 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd from faker import Faker +from scipy.stats import norm from rdt.transformers.base import BaseTransformer @@ -26,12 +27,21 @@ class CategoricalTransformer(BaseTransformer): Args: anonymize (str, tuple or list): Anonymization category. ``None`` disables anonymization. Defaults to ``None``. + fuzzy (bool): + Whether to generate Gaussian noise around the class representative of each interval + or just use the mean for all the replaced values. Defaults to ``False``. + clip (bool): + If ``True``, clip the values to [0, 1]. Otherwise normalize them using modulo 1. + Defaults to ``False``. """ mapping = None + intervals = None - def __init__(self, anonymize=False): + def __init__(self, anonymize=False, fuzzy=False, clip=False): self.anonymize = anonymize + self.fuzzy = fuzzy + self.clip = clip def _get_faker(self): """Return the faker object to anonymize data. @@ -44,7 +54,6 @@ def _get_faker(self): ValueError: A ``ValueError`` is raised if the faker category we want don't exist. 
""" - if isinstance(self.anonymize, (tuple, list)): category, *args = self.anonymize else: @@ -99,7 +108,9 @@ def _get_intervals(data): for value, frequency in frequencies.items(): prob = frequency / elements end = start + prob - intervals[value] = (start, end) + mean = (start + end) / 2 + std = prob / 6 + intervals[value] = (start, end, mean, std) start = end return intervals @@ -127,8 +138,11 @@ def fit(self, data): def _get_value(self, category): """Get the value that represents this category""" - start, end = self.intervals[category] - return (start + end) / 2 + mean, std = self.intervals[category][2:] + if self.fuzzy: + return norm.rvs(mean, std) + else: + return mean def transform(self, data): """Transform categorical values to float values. @@ -151,18 +165,21 @@ def transform(self, data): if self.anonymize: data = data.map(MAPS[id(self)]) - return data.fillna(np.nan).apply(self._get_value) + if len(self.intervals) == 2: + category = list(self.intervals.values())[0] + return (data == category).astype(int) - @staticmethod - def _normalize(data): + return data.fillna(np.nan).apply(self._get_value).values + + def _normalize(self, data): """Normalize data to the range [0, 1]. - This is done by substracting to each value its integer part, leaving only - the decimal part, and then shifting the sign of the negative values. + This is done by either clipping or computing the values modulo 1. """ - data = data - data.astype(int) - data[data < 0] = -data[data < 0] - return data + if self.clip: + return data.clip(0, 1) + + return np.mod(data, 1) def reverse_transform(self, data): """Convert float values back to the original categorical values. 
@@ -175,13 +192,44 @@ def reverse_transform(self, data): pandas.Series """ if not isinstance(data, pd.Series): + if len(data.shape) > 1: + data = data[:, 0] + data = pd.Series(data) data = self._normalize(data) result = pd.Series(index=data.index) - for category, (start, end) in self.intervals.items(): + for category, values in self.intervals.items(): + start, end = values[:2] result[(start < data) & (data < end)] = category return result + + +class OneHotEncodingTransformer(BaseTransformer): + + def fit(self, data): + self.dummies = pd.Series(data.value_counts().index) + + def transform(self, data): + dummies = pd.get_dummies(data) + return dummies.reindex(columns=self.dummies, fill_value=0).values.astype(int) + + def reverse_transform(self, data): + indices = np.argmax(data, axis=1) + return pd.Series(indices).map(self.dummies) + + +class LabelEncodingTransformer(BaseTransformer): + + def fit(self, data): + self.values = pd.Series(data.unique()).to_dict() + self.labels = {label: value for value, label in self.values.items()} + + def transform(self, data): + return data.map(self.labels) + + def reverse_transform(self, data): + return pd.Series(data).map(self.values) diff --git a/setup.cfg b/setup.cfg index 1d0a4937c..b68128ea9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.2 +current_version = 0.2.3.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index d794310fd..f7d14a167 100644 --- a/setup.py +++ b/setup.py @@ -89,6 +89,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sdv-dev/RDT', - version='0.2.2', + version='0.2.3.dev1', zip_safe=False, ) diff --git a/tests/transformers/test_categorical.py b/tests/transformers/test_categorical.py index 5f21f1116..86c993827 100644 --- a/tests/transformers/test_categorical.py +++ b/tests/transformers/test_categorical.py @@ -119,9 +119,9 @@ def test__get_intervals(self): # Asserts expected_intervals = { - 'foo': (0, 0.5), - 'tar': (0.5, 0.75), - 'bar': (0.75, 1) + 'foo': (0, 0.5, 0.25, 0.5 / 6), + 'tar': (0.5, 0.75, 0.625, 0.25 / 6), + 'bar': (0.75, 1, 0.875, 0.25 / 6) } assert result == expected_intervals @@ -261,13 +261,12 @@ def test_fit_series_anonymize(self): expect_intervals_call_args ) - @patch('scipy.stats.norm.rvs') - def test__get_value(self, scipy_mock): - """Test convert category value into num between 0 and 1""" + def test__get_value_no_fuzzy(self): # Run transformer = Mock() + transformer.fuzzy = False transformer.intervals = { - 'foo': (0, 0.5), + 'foo': (0, 0.5, 0.25, 0.5 / 6), } result = CategoricalTransformer._get_value(transformer, 'foo') @@ -275,6 +274,23 @@ def test__get_value(self, scipy_mock): # Asserts assert result == 0.25 + @patch('scipy.stats.norm.rvs') + def test__get_value_fuzzy(self, rvs_mock): + # setup + rvs_mock.return_value = 0.2745 + + # Run + transformer = Mock() + transformer.fuzzy = True + transformer.intervals = { + 'foo': (0, 0.5, 0.25, 0.5 / 6), + } + + result = CategoricalTransformer._get_value(transformer, 'foo') + + # Asserts + assert result == 0.2745 + @patch('rdt.transformers.categorical.MAPS') def test_transform_array_anonymize(self, mock_maps): """Test transform a numpy.array, anonymize""" @@ -286,6 +302,7 @@ def test_transform_array_anonymize(self, mock_maps): # Run transformer = Mock() transformer.anonymize = 'email' + transformer.intervals = [1, 2, 3] 
mock_maps[id(transformer)] = np.array(['bar_x', 'foo_x', 'foo_x', 'tar_x']) @@ -309,6 +326,7 @@ def test_transform_array_no_anonymize(self, mock_maps): # Run transformer = Mock() transformer.anonymize = None + transformer.intervals = [1, 2, 3] CategoricalTransformer.transform(transformer, data) @@ -321,16 +339,35 @@ def test_transform_array_no_anonymize(self, mock_maps): "Dont call to the map encoder when not anonymize" ) - def test__normalize(self): + def test__normalize_no_clip(self): """Test normalize data""" # Setup data = pd.Series([-0.43, 0.1234, 1.5, -1.31]) + transformer = Mock() + transformer.clip = False + + # Run + result = CategoricalTransformer._normalize(transformer, data) + + # Asserts + expect = pd.Series([0.57, 0.1234, 0.5, 0.69], dtype=float) + + pd.testing.assert_series_equal(result, expect) + + def test__normalize_clip(self): + """Test normalize data with clip=True""" + # Setup + data = pd.Series([-0.43, 0.1234, 1.5, -1.31]) + + transformer = Mock() + transformer.clip = True + # Run - result = CategoricalTransformer._normalize(data) + result = CategoricalTransformer._normalize(transformer, data) # Asserts - expect = pd.Series([0.43, 0.1234, 0.5, 0.31], dtype=float) + expect = pd.Series([0.0, 0.1234, 1.0, 0.0], dtype=float) pd.testing.assert_series_equal(result, expect)