Skip to content

Commit

Permalink
Merge pull request #91 from eltonlaw/eltonlaw/90-impossible-value-han…
Browse files Browse the repository at this point in the history
…dling

[GH-90] impossible value handling
  • Loading branch information
eltonlaw authored Oct 5, 2019
2 parents aadda08 + e76179b commit 774ae0f
Show file tree
Hide file tree
Showing 47 changed files with 651 additions and 452 deletions.
14 changes: 13 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,19 @@ DOCKER_ID_USER=eltonlaw

.PHONY: all build test upload install docs

all: test
all: test-local

install-local:
cd $(IMPYUTE_ROOT) && python setup.py develop

uninstall-local:
cd $(IMPYUTE_ROOT) && python setup.py develop --uninstall

test-local:
cd $(IMPYUTE_ROOT) && pytest

clean:
find . -type f -name '*.pyc' -delete

rebuild-pybase:
docker rmi -f $(DOCKER_ID_USER)/pybase
Expand Down
11 changes: 10 additions & 1 deletion RELEASE_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,13 @@

- Fix `fast_knn` incorrect weighting bug. Replaced distance weighting with inverse distance weighting and ability to swap in custom function (arg: 1D list of distances, ret: 1D list of weight percentages). New namespace created `impyute.util.inverse_distance_weighting` for functions that can be modified with custom args using `functool.partial` (check test for more details).
- pybase dockerfile bug fixes
-
- New `contrib` folder created and some of the utilities from `util` moved there:
* `impyute.util.compare` -> `impyute.contrib.compare`
* `impyute.util.count_missing` -> `impyute.contrib.count_missing`
* `impyute.util.describe` -> `impyute.contrib.describe`
- Util namespace breaking changes
* `impyute.util.find_null` -> `impyute.ops.matrix.nan_indices`
* `impyute.util.preprocess` -> `impyute.ops.wrapper.wrappers`
* `impyute.util.checks` -> `impyute.ops.wrapper.checks`
* impyute.util.BadInputError -> impyute.ops.errors.BadInputError
* impyute.util.BadOutputError -> impyute.ops.errors.BadOutputError
8 changes: 7 additions & 1 deletion impyute/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
impyute.imputations.cs: Imputations on cross sectional data
impyute.imputations.ts: Imputations on time series data
impyute.deletion: Deletion type missing data handling
impyute.contrib: Volatile and experimental code
"""
# pylint: disable=wrong-import-position

Expand All @@ -16,7 +17,12 @@

### Top Level Modules

__all__ = ["dataset", "util", "deletion"]
from impyute import dataset
from impyute import deletion
from impyute import ops
from impyute import contrib

__all__ = ["contrib", "dataset", "deletion", "ops"]

### Cross Sectional Imputations

Expand Down
7 changes: 7 additions & 0 deletions impyute/contrib/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
""" Volatile code. Expect stuff in this to change. """

from .describe import describe
from .count_missing import count_missing
from .compare import compare

__all__ = ["describe", "count_missing", "compare"]
2 changes: 1 addition & 1 deletion impyute/util/compare.py → impyute/contrib/compare.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""impyute.util.compare.py"""
"""impyute.contrib.compare.py"""
import importlib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
""" impyute.util.count_missing.py """
""" impyute.contrib.count_missing.py """
import numpy as np
from impyute.util import find_null
from impyute.ops import matrix

def count_missing(data):
""" Calculate the total percentage of missing values and also the
Expand All @@ -18,13 +18,13 @@ def count_missing(data):
"""
size = len(data.flatten())
null_xy = find_null(data)
np.unique(null_xy)
counter = {y: 0. for y in np.unique(null_xy.T[1])}
nan_xy = matrix.nan_indices(data)
np.unique(nan_xy)
counter = {y: 0. for y in np.unique(nan_xy.T[1])}
change_in_percentage = 1./size
for _, y in null_xy:
for _, y in nan_xy:
counter[y] += change_in_percentage
total_missing = len(null_xy)/size
total_missing = len(nan_xy)/size
counter["total"] = total_missing

return counter
32 changes: 17 additions & 15 deletions impyute/util/describe.py → impyute/contrib/describe.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
""" impyute.util.describe """
from impyute.util import find_null
""" impyute.contrib.describe """
from impyute.ops import matrix

def describe(data): # verbose=True):
""" Print input/output multiple times
Eventually will be used instead of matrix.nan_indices everywhere
Parameters
----------
data: numpy.nd.array
Expand All @@ -16,15 +18,15 @@ def describe(data): # verbose=True):
dict
missingness: list
Confidence interval of data being MCAR, MAR or MNAR - in that order
null_xy: list of tuples
nan_xy: list of tuples
Indices of all null points
null_n: list
nan_n: list
Total number of null values for each column
pmissing_n: float
Percentage of missing values in dataset
null_rows: list
nan_rows: list
Indices of all rows that are completely null
null_cols: list
nan_cols: list
Indices of all columns that are completely null
mean_rows: list
Mean value of each row
Expand All @@ -37,24 +39,24 @@ def describe(data): # verbose=True):
"""
# missingness = [0.33, 0.33, 0.33] # find_missingness(data)
null_xy = find_null(data)
null_n = len(null_xy)
pmissing_n = float(null_n/len(data.flatten))
nan_xy = matrix.nan_indices(data)
nan_n = len(nan_xy)
pmissing_n = float(nan_n/len(data.flatten))
# pmissing_rows = ""
# pmissing_cols = ""
# null_rows = ""
# null_cols = ""
# nan_rows = ""
# nan_cols = ""
# mean_rows = ""
# mean_cols = ""
# std_dev = ""
# "missingness": missingness,
description = {"null_xy": null_xy,
"null_n": null_n,
description = {"nan_xy": nan_xy,
"nan_n": nan_n,
"pmissing_n": pmissing_n}
# "pmissing_rows": pmissing_rows,
# "pmissing_cols": pmissing_cols,
# "null_rows": null_rows,
# "null_cols": null_cols,
# "nan_rows": nan_rows,
# "nan_cols": nan_cols,
# "mean_rows": mean_rows,
# "mean_cols": mean_cols,
# "std_dev": std_dev}
Expand Down
14 changes: 7 additions & 7 deletions impyute/dataset/base.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
""" Shared functions to load/generate data """
import numpy as np
import string
import random
import math
import itertools
import math
import random
import string
import numpy as np
from impyute.dataset.corrupt import Corruptor
from impyute.util import BadInputError
from impyute.ops import error

def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"):
""" Return randomly generated dataset of numbers with uniformly
Expand Down Expand Up @@ -89,7 +89,7 @@ def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2):
numpy.ndarray
"""
if shape[0]*shape[1] < nlevels:
raise BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape")
raise error.BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape")

length = len(string.ascii_lowercase)
n_fold = int(math.floor(math.log(nlevels, length)))
Expand Down Expand Up @@ -134,4 +134,4 @@ def mnist(missingness="mcar", thr=0.2):
dataset = fetch_mldata('MNIST original')
corruptor = Corruptor(dataset.data, thr=thr)
data = getattr(corruptor, missingness)()
return {"X": data, "Y": dataset.target}
return {"X": data, "Y": dataset.target}
4 changes: 2 additions & 2 deletions impyute/dataset/corrupt.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,10 @@ def mcar(self):
""" Overwrites values with MCAR placed NaN's """
data_1d = self.data.flatten()
n_total = len(data_1d)
null_x = np.random.choice(range(n_total),
nan_x = np.random.choice(range(n_total),
size=int(self.thr*n_total),
replace=False)
for x_i in null_x:
for x_i in nan_x:
data_1d[x_i] = np.nan
output = data_1d.reshape(self.shape)
return output
Expand Down
7 changes: 3 additions & 4 deletions impyute/deletion/complete_case.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
""" impyute.deletion.complete_case """
import numpy as np
from impyute.util import checks
from impyute.util import preprocess
from impyute.ops import wrapper

@preprocess
@checks
@wrapper.wrappers
@wrapper.checks
def complete_case(data):
""" Return only data rows with all columns
Expand Down
28 changes: 13 additions & 15 deletions impyute/imputation/cs/buck_iterative.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
""" impyute.imputation.cs.buck_iterative """
import numpy as np
from sklearn.linear_model import LinearRegression
from impyute.util import find_null
from impyute.util import checks
from impyute.util import preprocess
from impyute.ops import matrix
from impyute.ops import wrapper
# pylint: disable=too-many-locals

@preprocess
@checks
@wrapper.wrappers
@wrapper.checks
def buck_iterative(data):
""" Iterative variant of buck's method
Expand All @@ -30,32 +28,32 @@ def buck_iterative(data):
Imputed data.
"""
null_xy = find_null(data)
nan_xy = matrix.nan_indices(data)

# Add a column of zeros to the index values
null_xyv = np.append(null_xy, np.zeros((np.shape(null_xy)[0], 1)), axis=1)
nan_xyz = np.append(nan_xy, np.zeros((np.shape(nan_xy)[0], 1)), axis=1)

null_xyv = [[int(x), int(y), v] for x, y, v in null_xyv]
nan_xyz = [[int(x), int(y), v] for x, y, v in nan_xyz]
temp = []
cols_missing = {y for _, y, _ in null_xyv}
cols_missing = {y for _, y, _ in nan_xyz}

# Step 1: Simple Imputation, these are just placeholders
for x_i, y_i, value in null_xyv:
for x_i, y_i, value in nan_xyz:
# Column containing nan value without the nan value
col = data[:, [y_i]][~np.isnan(data[:, [y_i]])]

new_value = np.mean(col)
data[x_i][y_i] = new_value
temp.append([x_i, y_i, new_value])
null_xyv = temp
nan_xyz = temp

# Step 5: Repeat step 2 - 4 until convergence (the 100 is arbitrary)

converged = [False] * len(null_xyv)
converged = [False] * len(nan_xyz)
while not all(converged):
# Step 2: Placeholders are set back to missing for one variable/column
dependent_col = int(np.random.choice(list(cols_missing)))
missing_xs = [int(x) for x, y, value in null_xyv if y == dependent_col]
missing_xs = [int(x) for x, y, value in nan_xyz if y == dependent_col]

# Step 3: Perform linear regression using the other variables
x_train, y_train = [], []
Expand All @@ -68,7 +66,7 @@ def buck_iterative(data):
# Step 4: Missing values for the missing variable/column are replaced
# with predictions from our new linear regression model
# For null indices with the dependent column that was randomly chosen
for i, z in enumerate(null_xyv):
for i, z in enumerate(nan_xyz):
x_i = z[0]
y_i = z[1]
value = data[x_i, y_i]
Expand Down
32 changes: 15 additions & 17 deletions impyute/imputation/cs/central_tendency.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
""" impyute.imputation.cs.central_tendency """
import numpy as np
from impyute.util import find_null
from impyute.util import checks
from impyute.util import preprocess
from impyute.ops import matrix
from impyute.ops import wrapper

@preprocess
@checks
@wrapper.wrappers
@wrapper.checks
def mean(data):
""" Substitute missing values with the mean of that column.
Expand All @@ -20,15 +18,15 @@ def mean(data):
Imputed data.
"""
null_xy = find_null(data)
for x_i, y_i in null_xy:
nan_xy = matrix.nan_indices(data)
for x_i, y_i in nan_xy:
row_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
new_value = np.mean(row_wo_nan)
data[x_i][y_i] = new_value
return data

@preprocess
@checks
@wrapper.wrappers
@wrapper.checks
def median(data):
""" Substitute missing values with the median of that column(middle).
Expand All @@ -43,19 +41,19 @@ def median(data):
Imputed data.
"""
null_xy = find_null(data)
cols_missing = set(null_xy.T[1])
nan_xy = matrix.nan_indices(data)
cols_missing = set(nan_xy.T[1])
medians = {}
for y_i in cols_missing:
cols_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
median_y = np.median(cols_wo_nan)
medians[str(y_i)] = median_y
for x_i, y_i in null_xy:
for x_i, y_i in nan_xy:
data[x_i][y_i] = medians[str(y_i)]
return data

@preprocess
@checks
@wrapper.wrappers
@wrapper.checks
def mode(data):
""" Substitute missing values with the mode of that column(most frequent).
Expand All @@ -73,14 +71,14 @@ def mode(data):
Imputed data.
"""
null_xy = find_null(data)
nan_xy = matrix.nan_indices(data)
modes = []
for y_i in range(np.shape(data)[1]):
unique_counts = np.unique(data[:, [y_i]], return_counts=True)
max_count = np.max(unique_counts[1])
mode_y = [unique for unique, count in np.transpose(unique_counts)
if count == max_count and not np.isnan(unique)]
modes.append(mode_y) # Appends index of column and column modes
for x_i, y_i in null_xy:
for x_i, y_i in nan_xy:
data[x_i][y_i] = np.random.choice(modes[y_i])
return data
Loading

0 comments on commit 774ae0f

Please sign in to comment.