Skip to content

Commit

Permalink
Merge pull request #91 from eltonlaw/eltonlaw/90-impossible-value-han…
Browse files Browse the repository at this point in the history
…dling

[GH-90] impossible value handling
  • Loading branch information
eltonlaw authored Oct 5, 2019
2 parents aadda08 + e76179b commit 774ae0f
Show file tree
Hide file tree
Showing 47 changed files with 651 additions and 452 deletions.
14 changes: 13 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,19 @@ DOCKER_ID_USER=eltonlaw

.PHONY: all build test upload install docs

all: test
all: test-local

install-local:
cd $(IMPYUTE_ROOT) && python setup.py develop

uninstall-local:
cd $(IMPYUTE_ROOT) && python setup.py develop --uninstall

test-local:
cd $(IMPYUTE_ROOT) && pytest

clean:
find . -type f -name '*.pyc' -delete

rebuild-pybase:
docker rmi -f $(DOCKER_ID_USER)/pybase
Expand Down
11 changes: 10 additions & 1 deletion RELEASE_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,13 @@

- Fix `fast_knn` incorrect weighting bug. Replaced distance weighting with inverse distance weighting and ability to swap in custom function (arg: 1D list of distances, ret: 1D list of weight percentages). New namespace created `impyute.util.inverse_distance_weighting` for functions that can be modified with custom args using `functool.partial` (check test for more details).
- pybase dockerfile bug fixes
-
- New `contrib` folder created and some of the utilities from `util` moved there:
* `impyute.util.compare` -> `impyute.contrib.compare`
* `impyute.util.count_missing` -> `impyute.contrib.count_missing`
* `impyute.util.describe` -> `impyute.contrib.describe`
- Util namespace breaking changes
* `impyute.util.find_null` -> `impyute.ops.matrix.nan_indices`
* `impyute.util.preprocess` -> `impyute.ops.wrapper.wrappers`
* `impyute.util.checks` -> `impyute.ops.wrapper.checks`
* impyute.util.BadInputError -> impyute.ops.errors.BadInputError
* impyute.util.BadOutputError -> impyute.ops.errors.BadOutputError
8 changes: 7 additions & 1 deletion impyute/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
impyute.imputations.cs: Imputations on cross sectional data
impyute.imputations.ts: Imputations on time series data
impyute.deletion: Deletion type missing data handling
impyute.contrib: Volatile and experimental code
"""
# pylint: disable=wrong-import-position

Expand All @@ -16,7 +17,12 @@

### Top Level Modules

__all__ = ["dataset", "util", "deletion"]
from impyute import dataset
from impyute import deletion
from impyute import ops
from impyute import contrib

__all__ = ["contrib", "dataset", "deletion", "ops"]

### Cross Sectional Imputations

Expand Down
7 changes: 7 additions & 0 deletions impyute/contrib/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
""" Volatile code. Expect stuff in this to change. """

from .describe import describe
from .count_missing import count_missing
from .compare import compare

__all__ = ["describe", "count_missing", "compare"]
2 changes: 1 addition & 1 deletion impyute/util/compare.py → impyute/contrib/compare.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""impyute.util.compare.py"""
"""impyute.contrib.compare.py"""
import importlib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
""" impyute.util.count_missing.py """
""" impyute.contrib.count_missing.py """
import numpy as np
from impyute.util import find_null
from impyute.ops import matrix

def count_missing(data):
""" Calculate the total percentage of missing values and also the
Expand All @@ -18,13 +18,13 @@ def count_missing(data):
"""
size = len(data.flatten())
null_xy = find_null(data)
np.unique(null_xy)
counter = {y: 0. for y in np.unique(null_xy.T[1])}
nan_xy = matrix.nan_indices(data)
np.unique(nan_xy)
counter = {y: 0. for y in np.unique(nan_xy.T[1])}
change_in_percentage = 1./size
for _, y in null_xy:
for _, y in nan_xy:
counter[y] += change_in_percentage
total_missing = len(null_xy)/size
total_missing = len(nan_xy)/size
counter["total"] = total_missing

return counter
32 changes: 17 additions & 15 deletions impyute/util/describe.py → impyute/contrib/describe.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
""" impyute.util.describe """
from impyute.util import find_null
""" impyute.contrib.describe """
from impyute.ops import matrix

def describe(data): # verbose=True):
""" Print input/output multiple times
Eventually will be used instead of matrix.nan_indices everywhere
Parameters
----------
data: numpy.nd.array
Expand All @@ -16,15 +18,15 @@ def describe(data): # verbose=True):
dict
missingness: list
Confidence interval of data being MCAR, MAR or MNAR - in that order
null_xy: list of tuples
nan_xy: list of tuples
Indices of all null points
null_n: list
nan_n: list
Total number of null values for each column
pmissing_n: float
Percentage of missing values in dataset
null_rows: list
nan_rows: list
Indices of all rows that are completely null
null_cols: list
nan_cols: list
Indices of all columns that are completely null
mean_rows: list
Mean value of each row
Expand All @@ -37,24 +39,24 @@ def describe(data): # verbose=True):
"""
# missingness = [0.33, 0.33, 0.33] # find_missingness(data)
null_xy = find_null(data)
null_n = len(null_xy)
pmissing_n = float(null_n/len(data.flatten))
nan_xy = matrix.nan_indices(data)
nan_n = len(nan_xy)
pmissing_n = float(nan_n/len(data.flatten))
# pmissing_rows = ""
# pmissing_cols = ""
# null_rows = ""
# null_cols = ""
# nan_rows = ""
# nan_cols = ""
# mean_rows = ""
# mean_cols = ""
# std_dev = ""
# "missingness": missingness,
description = {"null_xy": null_xy,
"null_n": null_n,
description = {"nan_xy": nan_xy,
"nan_n": nan_n,
"pmissing_n": pmissing_n}
# "pmissing_rows": pmissing_rows,
# "pmissing_cols": pmissing_cols,
# "null_rows": null_rows,
# "null_cols": null_cols,
# "nan_rows": nan_rows,
# "nan_cols": nan_cols,
# "mean_rows": mean_rows,
# "mean_cols": mean_cols,
# "std_dev": std_dev}
Expand Down
14 changes: 7 additions & 7 deletions impyute/dataset/base.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
""" Shared functions to load/generate data """
import numpy as np
import string
import random
import math
import itertools
import math
import random
import string
import numpy as np
from impyute.dataset.corrupt import Corruptor
from impyute.util import BadInputError
from impyute.ops import error

def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"):
""" Return randomly generated dataset of numbers with uniformly
Expand Down Expand Up @@ -89,7 +89,7 @@ def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2):
numpy.ndarray
"""
if shape[0]*shape[1] < nlevels:
raise BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape")
raise error.BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape")

length = len(string.ascii_lowercase)
n_fold = int(math.floor(math.log(nlevels, length)))
Expand Down Expand Up @@ -134,4 +134,4 @@ def mnist(missingness="mcar", thr=0.2):
dataset = fetch_mldata('MNIST original')
corruptor = Corruptor(dataset.data, thr=thr)
data = getattr(corruptor, missingness)()
return {"X": data, "Y": dataset.target}
return {"X": data, "Y": dataset.target}
4 changes: 2 additions & 2 deletions impyute/dataset/corrupt.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,10 @@ def mcar(self):
""" Overwrites values with MCAR placed NaN's """
data_1d = self.data.flatten()
n_total = len(data_1d)
null_x = np.random.choice(range(n_total),
nan_x = np.random.choice(range(n_total),
size=int(self.thr*n_total),
replace=False)
for x_i in null_x:
for x_i in nan_x:
data_1d[x_i] = np.nan
output = data_1d.reshape(self.shape)
return output
Expand Down
7 changes: 3 additions & 4 deletions impyute/deletion/complete_case.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
""" impyute.deletion.complete_case """
import numpy as np
from impyute.util import checks
from impyute.util import preprocess
from impyute.ops import wrapper

@preprocess
@checks
@wrapper.wrappers
@wrapper.checks
def complete_case(data):
""" Return only data rows with all columns
Expand Down
28 changes: 13 additions & 15 deletions impyute/imputation/cs/buck_iterative.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
""" impyute.imputation.cs.buck_iterative """
import numpy as np
from sklearn.linear_model import LinearRegression
from impyute.util import find_null
from impyute.util import checks
from impyute.util import preprocess
from impyute.ops import matrix
from impyute.ops import wrapper
# pylint: disable=too-many-locals

@preprocess
@checks
@wrapper.wrappers
@wrapper.checks
def buck_iterative(data):
""" Iterative variant of buck's method
Expand All @@ -30,32 +28,32 @@ def buck_iterative(data):
Imputed data.
"""
null_xy = find_null(data)
nan_xy = matrix.nan_indices(data)

# Add a column of zeros to the index values
null_xyv = np.append(null_xy, np.zeros((np.shape(null_xy)[0], 1)), axis=1)
nan_xyz = np.append(nan_xy, np.zeros((np.shape(nan_xy)[0], 1)), axis=1)

null_xyv = [[int(x), int(y), v] for x, y, v in null_xyv]
nan_xyz = [[int(x), int(y), v] for x, y, v in nan_xyz]
temp = []
cols_missing = {y for _, y, _ in null_xyv}
cols_missing = {y for _, y, _ in nan_xyz}

# Step 1: Simple Imputation, these are just placeholders
for x_i, y_i, value in null_xyv:
for x_i, y_i, value in nan_xyz:
# Column containing nan value without the nan value
col = data[:, [y_i]][~np.isnan(data[:, [y_i]])]

new_value = np.mean(col)
data[x_i][y_i] = new_value
temp.append([x_i, y_i, new_value])
null_xyv = temp
nan_xyz = temp

# Step 5: Repeat step 2 - 4 until convergence (the 100 is arbitrary)

converged = [False] * len(null_xyv)
converged = [False] * len(nan_xyz)
while not all(converged):
# Step 2: Placeholders are set back to missing for one variable/column
dependent_col = int(np.random.choice(list(cols_missing)))
missing_xs = [int(x) for x, y, value in null_xyv if y == dependent_col]
missing_xs = [int(x) for x, y, value in nan_xyz if y == dependent_col]

# Step 3: Perform linear regression using the other variables
x_train, y_train = [], []
Expand All @@ -68,7 +66,7 @@ def buck_iterative(data):
# Step 4: Missing values for the missing variable/column are replaced
# with predictions from our new linear regression model
# For null indices with the dependent column that was randomly chosen
for i, z in enumerate(null_xyv):
for i, z in enumerate(nan_xyz):
x_i = z[0]
y_i = z[1]
value = data[x_i, y_i]
Expand Down
32 changes: 15 additions & 17 deletions impyute/imputation/cs/central_tendency.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
""" impyute.imputation.cs.central_tendency """
import numpy as np
from impyute.util import find_null
from impyute.util import checks
from impyute.util import preprocess
from impyute.ops import matrix
from impyute.ops import wrapper

@preprocess
@checks
@wrapper.wrappers
@wrapper.checks
def mean(data):
""" Substitute missing values with the mean of that column.
Expand All @@ -20,15 +18,15 @@ def mean(data):
Imputed data.
"""
null_xy = find_null(data)
for x_i, y_i in null_xy:
nan_xy = matrix.nan_indices(data)
for x_i, y_i in nan_xy:
row_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
new_value = np.mean(row_wo_nan)
data[x_i][y_i] = new_value
return data

@preprocess
@checks
@wrapper.wrappers
@wrapper.checks
def median(data):
""" Substitute missing values with the median of that column(middle).
Expand All @@ -43,19 +41,19 @@ def median(data):
Imputed data.
"""
null_xy = find_null(data)
cols_missing = set(null_xy.T[1])
nan_xy = matrix.nan_indices(data)
cols_missing = set(nan_xy.T[1])
medians = {}
for y_i in cols_missing:
cols_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
median_y = np.median(cols_wo_nan)
medians[str(y_i)] = median_y
for x_i, y_i in null_xy:
for x_i, y_i in nan_xy:
data[x_i][y_i] = medians[str(y_i)]
return data

@preprocess
@checks
@wrapper.wrappers
@wrapper.checks
def mode(data):
""" Substitute missing values with the mode of that column(most frequent).
Expand All @@ -73,14 +71,14 @@ def mode(data):
Imputed data.
"""
null_xy = find_null(data)
nan_xy = matrix.nan_indices(data)
modes = []
for y_i in range(np.shape(data)[1]):
unique_counts = np.unique(data[:, [y_i]], return_counts=True)
max_count = np.max(unique_counts[1])
mode_y = [unique for unique, count in np.transpose(unique_counts)
if count == max_count and not np.isnan(unique)]
modes.append(mode_y) # Appends index of column and column modes
for x_i, y_i in null_xy:
for x_i, y_i in nan_xy:
data[x_i][y_i] = np.random.choice(modes[y_i])
return data
Loading

0 comments on commit 774ae0f

Please sign in to comment.