Skip to content

Commit

Permalink
make release-tag: Merge branch 'master' into stable
Browse files Browse the repository at this point in the history
  • Loading branch information
csala committed Jul 9, 2020
2 parents bc885cc + efa077f commit 16ce0ea
Show file tree
Hide file tree
Showing 8 changed files with 124 additions and 32 deletions.
4 changes: 4 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# History

## 0.2.3 - 2020-07-09

* Implement OneHot and Label encoding as transformers - Issue [#112](https://github.com/sdv-dev/RDT/issues/112) by @csala

## 0.2.2 - 2020-06-26

### Bugs Fixed
Expand Down
2 changes: 1 addition & 1 deletion rdt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

__author__ = """MIT Data To AI Lab"""
__email__ = '[email protected]'
__version__ = '0.2.2'
__version__ = '0.2.3.dev1'

import numpy as np
import pandas as pd
Expand Down
8 changes: 4 additions & 4 deletions rdt/hyper_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,10 @@ def transform(self, data):

shape = transformed.shape

if len(shape) == 2 and shape[1] == 2:
data[column_name] = transformed[:, 0]
new_column = '{}#{}'.format(column_name, 1)
data[new_column] = transformed[:, 1]
if len(shape) == 2:
for index in range(shape[1]):
new_column = '{}#{}'.format(column_name, index)
data[new_column] = transformed[:, index]

else:
data[column_name] = transformed
Expand Down
5 changes: 4 additions & 1 deletion rdt/transformers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from rdt.transformers.base import BaseTransformer
from rdt.transformers.boolean import BooleanTransformer
from rdt.transformers.categorical import CategoricalTransformer
from rdt.transformers.categorical import (
CategoricalTransformer, LabelEncodingTransformer, OneHotEncodingTransformer)
from rdt.transformers.datetime import DatetimeTransformer
from rdt.transformers.null import NullTransformer
from rdt.transformers.numerical import NumericalTransformer
Expand All @@ -12,6 +13,8 @@
'DatetimeTransformer',
'NumericalTransformer',
'NullTransformer',
'OneHotEncodingTransformer',
'LabelEncodingTransformer',
]

TRANSFORMERS = {
Expand Down
76 changes: 62 additions & 14 deletions rdt/transformers/categorical.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd
from faker import Faker
from scipy.stats import norm

from rdt.transformers.base import BaseTransformer

Expand All @@ -26,12 +27,21 @@ class CategoricalTransformer(BaseTransformer):
Args:
anonymize (str, tuple or list):
Anonymization category. ``None`` disables anonymization. Defaults to ``None``.
fuzzy (bool):
Whether to generate Gaussian noise around the class representative of each interval
or just use the mean for all the replaced values. Defaults to ``False``.
clip (bool):
If ``True``, clip the values to [0, 1]. Otherwise normalize them using modulo 1.
Defaults to ``False``.
"""

mapping = None
intervals = None

def __init__(self, anonymize=False):
def __init__(self, anonymize=False, fuzzy=False, clip=False):
self.anonymize = anonymize
self.fuzzy = fuzzy
self.clip = clip

def _get_faker(self):
"""Return the faker object to anonymize data.
Expand All @@ -44,7 +54,6 @@ def _get_faker(self):
ValueError:
A ``ValueError`` is raised if the faker category we want doesn't exist.
"""

if isinstance(self.anonymize, (tuple, list)):
category, *args = self.anonymize
else:
Expand Down Expand Up @@ -99,7 +108,9 @@ def _get_intervals(data):
for value, frequency in frequencies.items():
prob = frequency / elements
end = start + prob
intervals[value] = (start, end)
mean = (start + end) / 2
std = prob / 6
intervals[value] = (start, end, mean, std)
start = end

return intervals
Expand Down Expand Up @@ -127,8 +138,11 @@ def fit(self, data):

def _get_value(self, category):
"""Get the value that represents this category"""
start, end = self.intervals[category]
return (start + end) / 2
mean, std = self.intervals[category][2:]
if self.fuzzy:
return norm.rvs(mean, std)
else:
return mean

def transform(self, data):
"""Transform categorical values to float values.
Expand All @@ -151,18 +165,21 @@ def transform(self, data):
if self.anonymize:
data = data.map(MAPS[id(self)])

return data.fillna(np.nan).apply(self._get_value)
if len(self.intervals) == 2:
category = list(self.intervals.values())[0]
return (data == category).astype(int)

@staticmethod
def _normalize(data):
return data.fillna(np.nan).apply(self._get_value).values

def _normalize(self, data):
"""Normalize data to the range [0, 1].
This is done by subtracting from each value its integer part, leaving only
the decimal part, and then shifting the sign of the negative values.
This is done by either clipping or computing the values modulo 1.
"""
data = data - data.astype(int)
data[data < 0] = -data[data < 0]
return data
if self.clip:
return data.clip(0, 1)

return np.mod(data, 1)

def reverse_transform(self, data):
"""Convert float values back to the original categorical values.
Expand All @@ -175,13 +192,44 @@ def reverse_transform(self, data):
pandas.Series
"""
if not isinstance(data, pd.Series):
if len(data.shape) > 1:
data = data[:, 0]

data = pd.Series(data)

data = self._normalize(data)

result = pd.Series(index=data.index)

for category, (start, end) in self.intervals.items():
for category, values in self.intervals.items():
start, end = values[:2]
result[(start < data) & (data < end)] = category

return result


class OneHotEncodingTransformer(BaseTransformer):
    """Encode categorical data as one-hot (dummy) columns.

    ``fit`` records the distinct values found in the data, in the order
    returned by ``value_counts``. ``transform`` produces one 0/1 column per
    recorded value, and ``reverse_transform`` maps every row back to the
    value whose column holds the row maximum.
    """

    def fit(self, data):
        """Learn the values that will get their own one-hot column.

        Args:
            data (pandas.Series or array-like):
                Data to learn the categories from.
        """
        # ``transform`` already accepts array-likes through ``pd.get_dummies``,
        # so accept them here as well instead of failing on ``value_counts``.
        if not isinstance(data, pd.Series):
            data = pd.Series(data)

        self.dummies = pd.Series(data.value_counts().index)

    def transform(self, data):
        """Replace each value by its one-hot representation.

        Values that were not seen during ``fit`` produce an all-zero row, and
        fitted values missing from ``data`` still get their (all-zero) column.

        Args:
            data (pandas.Series or array-like):
                Data to encode.

        Returns:
            numpy.ndarray:
                Integer array with one row per input value and one column per
                fitted category.
        """
        dummies = pd.get_dummies(data)
        # Reindex so that the column layout always matches the fitted one.
        return dummies.reindex(columns=self.dummies, fill_value=0).values.astype(int)

    def reverse_transform(self, data):
        """Convert one-hot rows back to the original values.

        Args:
            data (numpy.ndarray or array-like):
                Encoded data. A 1-D input is treated as a single encoded column.

        Returns:
            pandas.Series:
                For each row, the fitted value at the position of the row maximum.
        """
        data = np.asarray(data)
        if data.ndim == 1:
            # ``argmax`` over ``axis=1`` needs a 2-D array.
            data = data.reshape(-1, 1)

        indices = np.argmax(data, axis=1)
        return pd.Series(indices).map(self.dummies)


class LabelEncodingTransformer(BaseTransformer):
    """Encode categorical data as integer labels.

    Each distinct value gets the integer position at which it first appears
    in the fitted data.
    """

    def fit(self, data):
        """Build the label <-> category lookup tables.

        Args:
            data (pandas.Series):
                Data to learn the categories from.
        """
        distinct = pd.Series(data.unique())

        # ``values`` goes from integer label to category,
        # ``labels`` is the inverse lookup.
        self.values = distinct.to_dict()
        self.labels = {
            category: label
            for label, category in self.values.items()
        }

    def transform(self, data):
        """Replace each category by its integer label.

        Args:
            data (pandas.Series):
                Data to encode.

        Returns:
            pandas.Series
        """
        encoded = data.map(self.labels)
        return encoded

    def reverse_transform(self, data):
        """Replace each integer label by the category it stands for.

        Args:
            data (pandas.Series or array-like):
                Labels to decode.

        Returns:
            pandas.Series
        """
        labels = pd.Series(data)
        return labels.map(self.values)
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.2.2
current_version = 0.2.3.dev1
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,6 @@
test_suite='tests',
tests_require=tests_require,
url='https://github.com/sdv-dev/RDT',
version='0.2.2',
version='0.2.3.dev1',
zip_safe=False,
)
57 changes: 47 additions & 10 deletions tests/transformers/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,9 @@ def test__get_intervals(self):

# Asserts
expected_intervals = {
'foo': (0, 0.5),
'tar': (0.5, 0.75),
'bar': (0.75, 1)
'foo': (0, 0.5, 0.25, 0.5 / 6),
'tar': (0.5, 0.75, 0.625, 0.25 / 6),
'bar': (0.75, 1, 0.875, 0.25 / 6)
}
assert result == expected_intervals

Expand Down Expand Up @@ -261,20 +261,36 @@ def test_fit_series_anonymize(self):
expect_intervals_call_args
)

@patch('scipy.stats.norm.rvs')
def test__get_value(self, scipy_mock):
"""Test convert category value into num between 0 and 1"""
def test__get_value_no_fuzzy(self):
# Run
transformer = Mock()
transformer.fuzzy = False
transformer.intervals = {
'foo': (0, 0.5),
'foo': (0, 0.5, 0.25, 0.5 / 6),
}

result = CategoricalTransformer._get_value(transformer, 'foo')

# Asserts
assert result == 0.25

@patch('scipy.stats.norm.rvs')
def test__get_value_fuzzy(self, rvs_mock):
# setup
rvs_mock.return_value = 0.2745

# Run
transformer = Mock()
transformer.fuzzy = True
transformer.intervals = {
'foo': (0, 0.5, 0.25, 0.5 / 6),
}

result = CategoricalTransformer._get_value(transformer, 'foo')

# Asserts
assert result == 0.2745

@patch('rdt.transformers.categorical.MAPS')
def test_transform_array_anonymize(self, mock_maps):
"""Test transform a numpy.array, anonymize"""
Expand All @@ -286,6 +302,7 @@ def test_transform_array_anonymize(self, mock_maps):
# Run
transformer = Mock()
transformer.anonymize = 'email'
transformer.intervals = [1, 2, 3]

mock_maps[id(transformer)] = np.array(['bar_x', 'foo_x', 'foo_x', 'tar_x'])

Expand All @@ -309,6 +326,7 @@ def test_transform_array_no_anonymize(self, mock_maps):
# Run
transformer = Mock()
transformer.anonymize = None
transformer.intervals = [1, 2, 3]

CategoricalTransformer.transform(transformer, data)

Expand All @@ -321,16 +339,35 @@ def test_transform_array_no_anonymize(self, mock_maps):
"Dont call to the map encoder when not anonymize"
)

def test__normalize(self):
def test__normalize_no_clip(self):
"""Test normalize data"""
# Setup
data = pd.Series([-0.43, 0.1234, 1.5, -1.31])

transformer = Mock()
transformer.clip = False

# Run
result = CategoricalTransformer._normalize(transformer, data)

# Asserts
expect = pd.Series([0.57, 0.1234, 0.5, 0.69], dtype=float)

pd.testing.assert_series_equal(result, expect)

def test__normalize_clip(self):
"""Test normalize data with clip=True"""
# Setup
data = pd.Series([-0.43, 0.1234, 1.5, -1.31])

transformer = Mock()
transformer.clip = True

# Run
result = CategoricalTransformer._normalize(data)
result = CategoricalTransformer._normalize(transformer, data)

# Asserts
expect = pd.Series([0.43, 0.1234, 0.5, 0.31], dtype=float)
expect = pd.Series([0.0, 0.1234, 1.0, 0.0], dtype=float)

pd.testing.assert_series_equal(result, expect)

Expand Down

0 comments on commit 16ce0ea

Please sign in to comment.