diff --git a/Makefile b/Makefile index 5e3ed0d..351890d 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,19 @@ DOCKER_ID_USER=eltonlaw .PHONY: all build test upload install docs -all: test +all: test-local + +install-local: + cd $(IMPYUTE_ROOT) && python setup.py develop + +uninstall-local: + cd $(IMPYUTE_ROOT) && python setup.py develop --uninstall + +test-local: + cd $(IMPYUTE_ROOT) && pytest + +clean: + find . -type f -name '*.pyc' -delete rebuild-pybase: docker rmi -f $(DOCKER_ID_USER)/pybase diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 21d92ec..4e65de1 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -2,4 +2,13 @@ - Fix `fast_knn` incorrect weighting bug. Replaced distance weighting with inverse distance weighting and ability to swap in custom function (arg: 1D list of distances, ret: 1D list of weight percentages). New namespace created `impyute.util.inverse_distance_weighting` for functions that can be modified with custom args using `functool.partial` (check test for more details). 
- pybase dockerfile bug fixes -- +- New `contrib` folder created and some of the utilities from `util` moved there: + * `impyute.util.compare` -> `impyute.contrib.compare` + * `impyute.util.count_missing` -> `impyute.contrib.count_missing` + * `impyute.util.describe` -> `impyute.contrib.describe` +- Util namespace breaking changes + * impyute.util.find_null->impyute.ops.matrix.nan_indices + * impyute.util.preprocess->impyute.ops.wrapper.wrappers + * impyute.util.checks->impyute.ops.wrapper.checks + * impyute.util.BadInputError -> impyute.ops.error.BadInputError + * impyute.util.BadOutputError -> impyute.ops.error.BadOutputError diff --git a/impyute/__init__.py b/impyute/__init__.py index 4b34f6a..9b44855 100644 --- a/impyute/__init__.py +++ b/impyute/__init__.py @@ -3,6 +3,7 @@ impyute.imputations.cs: Imputations on cross sectional data impyute.imputations.ts: Imputations on time series data impyute.deletion: Deletion type missing data handling +impyute.contrib: Volatile and experimental code """ # pylint: disable=wrong-import-position @@ -16,7 +17,12 @@ ### Top Level Modules -__all__ = ["dataset", "util", "deletion"] +from impyute import dataset +from impyute import deletion +from impyute import ops +from impyute import contrib + +__all__ = ["contrib", "dataset", "deletion", "ops"] ### Cross Sectional Imputations diff --git a/impyute/contrib/__init__.py b/impyute/contrib/__init__.py new file mode 100644 index 0000000..ac15b3a --- /dev/null +++ b/impyute/contrib/__init__.py @@ -0,0 +1,7 @@ +""" Volatile code. Expect stuff in this to change. 
""" + +from .describe import describe +from .count_missing import count_missing +from .compare import compare + +__all__ = ["describe", "count_missing", "compare"] diff --git a/impyute/util/compare.py b/impyute/contrib/compare.py similarity index 98% rename from impyute/util/compare.py rename to impyute/contrib/compare.py index 3e70b2b..4cbf028 100644 --- a/impyute/util/compare.py +++ b/impyute/contrib/compare.py @@ -1,4 +1,4 @@ -"""impyute.util.compare.py""" +"""impyute.contrib.compare.py""" import importlib from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score diff --git a/impyute/util/count_missing.py b/impyute/contrib/count_missing.py similarity index 66% rename from impyute/util/count_missing.py rename to impyute/contrib/count_missing.py index c5717de..f0117cc 100644 --- a/impyute/util/count_missing.py +++ b/impyute/contrib/count_missing.py @@ -1,6 +1,6 @@ -""" impyute.util.count_missing.py """ +""" impyute.contrib.count_missing.py """ import numpy as np -from impyute.util import find_null +from impyute.ops import matrix def count_missing(data): """ Calculate the total percentage of missing values and also the @@ -18,13 +18,13 @@ def count_missing(data): """ size = len(data.flatten()) - null_xy = find_null(data) - np.unique(null_xy) - counter = {y: 0. for y in np.unique(null_xy.T[1])} + nan_xy = matrix.nan_indices(data) + np.unique(nan_xy) + counter = {y: 0. 
for y in np.unique(nan_xy.T[1])} change_in_percentage = 1./size - for _, y in null_xy: + for _, y in nan_xy: counter[y] += change_in_percentage - total_missing = len(null_xy)/size + total_missing = len(nan_xy)/size counter["total"] = total_missing return counter diff --git a/impyute/util/describe.py b/impyute/contrib/describe.py similarity index 73% rename from impyute/util/describe.py rename to impyute/contrib/describe.py index c87c717..d2ddb49 100644 --- a/impyute/util/describe.py +++ b/impyute/contrib/describe.py @@ -1,9 +1,11 @@ -""" impyute.util.describe """ -from impyute.util import find_null +""" impyute.contrib.describe """ +from impyute.ops import matrix def describe(data): # verbose=True): """ Print input/output multiple times + Eventually will be used instead of matrix.nan_indices everywhere + Parameters ---------- data: numpy.nd.array @@ -16,15 +18,15 @@ def describe(data): # verbose=True): dict missingness: list Confidence interval of data being MCAR, MAR or MNAR - in that order - null_xy: list of tuples + nan_xy: list of tuples Indices of all null points - null_n: list + nan_n: list Total number of null values for each column pmissing_n: float Percentage of missing values in dataset - null_rows: list + nan_rows: list Indices of all rows that are completely null - null_cols: list + nan_cols: list Indices of all columns that are completely null mean_rows: list Mean value of each row @@ -37,24 +39,24 @@ def describe(data): # verbose=True): """ # missingness = [0.33, 0.33, 0.33] # find_missingness(data) - null_xy = find_null(data) - null_n = len(null_xy) - pmissing_n = float(null_n/len(data.flatten)) + nan_xy = matrix.nan_indices(data) + nan_n = len(nan_xy) + pmissing_n = float(nan_n/len(data.flatten)) # pmissing_rows = "" # pmissing_cols = "" -# null_rows = "" -# null_cols = "" +# nan_rows = "" +# nan_cols = "" # mean_rows = "" # mean_cols = "" # std_dev = "" # "missingness": missingness, - description = {"null_xy": null_xy, - "null_n": null_n, + 
description = {"nan_xy": nan_xy, + "nan_n": nan_n, "pmissing_n": pmissing_n} # "pmissing_rows": pmissing_rows, # "pmissing_cols": pmissing_cols, -# "null_rows": null_rows, -# "null_cols": null_cols, +# "nan_rows": nan_rows, +# "nan_cols": nan_cols, # "mean_rows": mean_rows, # "mean_cols": mean_cols, # "std_dev": std_dev} diff --git a/impyute/dataset/base.py b/impyute/dataset/base.py index e1f7b90..3f18e56 100644 --- a/impyute/dataset/base.py +++ b/impyute/dataset/base.py @@ -1,11 +1,11 @@ """ Shared functions to load/generate data """ -import numpy as np -import string -import random -import math import itertools +import math +import random +import string +import numpy as np from impyute.dataset.corrupt import Corruptor -from impyute.util import BadInputError +from impyute.ops import error def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"): """ Return randomly generated dataset of numbers with uniformly @@ -89,7 +89,7 @@ def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2): numpy.ndarray """ if shape[0]*shape[1] < nlevels: - raise BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape") + raise error.BadInputError("nlevel exceeds the size of desired dataset. 
Please decrease the nlevel or increase the shape") length = len(string.ascii_lowercase) n_fold = int(math.floor(math.log(nlevels, length))) @@ -134,4 +134,4 @@ def mnist(missingness="mcar", thr=0.2): dataset = fetch_mldata('MNIST original') corruptor = Corruptor(dataset.data, thr=thr) data = getattr(corruptor, missingness)() - return {"X": data, "Y": dataset.target} \ No newline at end of file + return {"X": data, "Y": dataset.target} diff --git a/impyute/dataset/corrupt.py b/impyute/dataset/corrupt.py index 846b27c..566f80d 100644 --- a/impyute/dataset/corrupt.py +++ b/impyute/dataset/corrupt.py @@ -33,10 +33,10 @@ def mcar(self): """ Overwrites values with MCAR placed NaN's """ data_1d = self.data.flatten() n_total = len(data_1d) - null_x = np.random.choice(range(n_total), + nan_x = np.random.choice(range(n_total), size=int(self.thr*n_total), replace=False) - for x_i in null_x: + for x_i in nan_x: data_1d[x_i] = np.nan output = data_1d.reshape(self.shape) return output diff --git a/impyute/deletion/complete_case.py b/impyute/deletion/complete_case.py index 5e50ae6..253fa50 100644 --- a/impyute/deletion/complete_case.py +++ b/impyute/deletion/complete_case.py @@ -1,10 +1,9 @@ """ impyute.deletion.complete_case """ import numpy as np -from impyute.util import checks -from impyute.util import preprocess +from impyute.ops import wrapper -@preprocess -@checks +@wrapper.wrappers +@wrapper.checks def complete_case(data): """ Return only data rows with all columns diff --git a/impyute/imputation/cs/buck_iterative.py b/impyute/imputation/cs/buck_iterative.py index d255832..e1401fe 100644 --- a/impyute/imputation/cs/buck_iterative.py +++ b/impyute/imputation/cs/buck_iterative.py @@ -1,13 +1,11 @@ -""" impyute.imputation.cs.buck_iterative """ import numpy as np from sklearn.linear_model import LinearRegression -from impyute.util import find_null -from impyute.util import checks -from impyute.util import preprocess +from impyute.ops import matrix +from impyute.ops import 
wrapper # pylint: disable=too-many-locals -@preprocess -@checks +@wrapper.wrappers +@wrapper.checks def buck_iterative(data): """ Iterative variant of buck's method @@ -30,32 +28,32 @@ def buck_iterative(data): Imputed data. """ - null_xy = find_null(data) + nan_xy = matrix.nan_indices(data) # Add a column of zeros to the index values - null_xyv = np.append(null_xy, np.zeros((np.shape(null_xy)[0], 1)), axis=1) + nan_xyz = np.append(nan_xy, np.zeros((np.shape(nan_xy)[0], 1)), axis=1) - null_xyv = [[int(x), int(y), v] for x, y, v in null_xyv] + nan_xyz = [[int(x), int(y), v] for x, y, v in nan_xyz] temp = [] - cols_missing = {y for _, y, _ in null_xyv} + cols_missing = {y for _, y, _ in nan_xyz} # Step 1: Simple Imputation, these are just placeholders - for x_i, y_i, value in null_xyv: + for x_i, y_i, value in nan_xyz: # Column containing nan value without the nan value col = data[:, [y_i]][~np.isnan(data[:, [y_i]])] new_value = np.mean(col) data[x_i][y_i] = new_value temp.append([x_i, y_i, new_value]) - null_xyv = temp + nan_xyz = temp # Step 5: Repeat step 2 - 4 until convergence (the 100 is arbitrary) - converged = [False] * len(null_xyv) + converged = [False] * len(nan_xyz) while not all(converged): # Step 2: Placeholders are set back to missing for one variable/column dependent_col = int(np.random.choice(list(cols_missing))) - missing_xs = [int(x) for x, y, value in null_xyv if y == dependent_col] + missing_xs = [int(x) for x, y, value in nan_xyz if y == dependent_col] # Step 3: Perform linear regression using the other variables x_train, y_train = [], [] @@ -68,7 +66,7 @@ def buck_iterative(data): # Step 4: Missing values for the missing variable/column are replaced # with predictions from our new linear regression model # For null indices with the dependent column that was randomly chosen - for i, z in enumerate(null_xyv): + for i, z in enumerate(nan_xyz): x_i = z[0] y_i = z[1] value = data[x_i, y_i] diff --git a/impyute/imputation/cs/central_tendency.py 
b/impyute/imputation/cs/central_tendency.py index 53fdf2e..8743653 100644 --- a/impyute/imputation/cs/central_tendency.py +++ b/impyute/imputation/cs/central_tendency.py @@ -1,11 +1,9 @@ -""" impyute.imputation.cs.central_tendency """ import numpy as np -from impyute.util import find_null -from impyute.util import checks -from impyute.util import preprocess +from impyute.ops import matrix +from impyute.ops import wrapper -@preprocess -@checks +@wrapper.wrappers +@wrapper.checks def mean(data): """ Substitute missing values with the mean of that column. @@ -20,15 +18,15 @@ def mean(data): Imputed data. """ - null_xy = find_null(data) - for x_i, y_i in null_xy: + nan_xy = matrix.nan_indices(data) + for x_i, y_i in nan_xy: row_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])] new_value = np.mean(row_wo_nan) data[x_i][y_i] = new_value return data -@preprocess -@checks +@wrapper.wrappers +@wrapper.checks def median(data): """ Substitute missing values with the median of that column(middle). @@ -43,19 +41,19 @@ def median(data): Imputed data. """ - null_xy = find_null(data) - cols_missing = set(null_xy.T[1]) + nan_xy = matrix.nan_indices(data) + cols_missing = set(nan_xy.T[1]) medians = {} for y_i in cols_missing: cols_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])] median_y = np.median(cols_wo_nan) medians[str(y_i)] = median_y - for x_i, y_i in null_xy: + for x_i, y_i in nan_xy: data[x_i][y_i] = medians[str(y_i)] return data -@preprocess -@checks +@wrapper.wrappers +@wrapper.checks def mode(data): """ Substitute missing values with the mode of that column(most frequent). @@ -73,7 +71,7 @@ def mode(data): Imputed data. 
""" - null_xy = find_null(data) + nan_xy = matrix.nan_indices(data) modes = [] for y_i in range(np.shape(data)[1]): unique_counts = np.unique(data[:, [y_i]], return_counts=True) @@ -81,6 +79,6 @@ def mode(data): mode_y = [unique for unique, count in np.transpose(unique_counts) if count == max_count and not np.isnan(unique)] modes.append(mode_y) # Appends index of column and column modes - for x_i, y_i in null_xy: + for x_i, y_i in nan_xy: data[x_i][y_i] = np.random.choice(modes[y_i]) return data diff --git a/impyute/imputation/cs/em.py b/impyute/imputation/cs/em.py index 4eb28e1..184962a 100644 --- a/impyute/imputation/cs/em.py +++ b/impyute/imputation/cs/em.py @@ -1,11 +1,9 @@ -""" impyute.imputation.cs.em""" import numpy as np -from impyute.util import find_null -from impyute.util import preprocess -from impyute.util import checks +from impyute.ops import matrix +from impyute.ops import wrapper -@preprocess -@checks +@wrapper.wrappers +@wrapper.checks def em(data, loops=50): """ Imputes given data using expectation maximization. @@ -28,8 +26,8 @@ def em(data, loops=50): Imputed data. """ - null_xy = find_null(data) - for x_i, y_i in null_xy: + nan_xy = matrix.nan_indices(data) + for x_i, y_i in nan_xy: col = data[:, int(y_i)] mu = col[~np.isnan(col)].mean() std = col[~np.isnan(col)].std() diff --git a/impyute/imputation/cs/fast_knn.py b/impyute/imputation/cs/fast_knn.py index 814f2b6..0411f5c 100644 --- a/impyute/imputation/cs/fast_knn.py +++ b/impyute/imputation/cs/fast_knn.py @@ -1,22 +1,23 @@ -""" impyute.imputation.cs.knn """ import numpy as np from scipy.spatial import KDTree -from impyute.util import find_null -from impyute.util import checks -from impyute.util import preprocess -from impyute.util import inverse_distance_weighting as util_idw -from impyute.imputation.cs import mean +from impyute.ops import matrix +from impyute.ops import wrapper +from impyute.ops import inverse_distance_weighting as idw + +from . 
import mean # pylint: disable=too-many-arguments -@preprocess -@checks -def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10, idw=util_idw.shepards): +@wrapper.wrappers +@wrapper.checks +def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10, + idw_fn=idw.shepards, init_impute_fn=mean): """ Impute using a variant of the nearest neighbours approach - Basic idea: Impute array with a basic mean impute and then use the resulting complete - array to construct a KDTree. Use this KDTree to compute nearest neighbours. - After finding `k` nearest neighbours, take the weighted average of them. Basically, - find the nearest row in terms of distance + Basic idea: Impute array with a passed in initial impute fn (mean impute) + and then use the resulting complete array to construct a KDTree. Use this + KDTree to compute nearest neighbours. After finding `k` nearest + neighbours, take the weighted average of them. Basically, find the nearest + row in terms of distance This approach is much, much faster than the other implementation (fit+transform for each subset) which is almost prohibitively expensive. @@ -65,11 +66,13 @@ def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10, id [`scipy.spatial.KDTree`](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.KDTree.html) for more information. - idw: fn, optional + idw_fn: fn, optional Function that takes one argument, a list of distances, and returns weighted percentages. 
You can define a custom one or bootstrap from functions defined in `impy.util.inverse_distance_weighting` which can be using functools.partial, for example: `functools.partial(impy.util.inverse_distance_weighting.shepards, power=1)` + init_impute_fn: fn, optional + Returns ------- numpy.ndarray @@ -109,18 +112,18 @@ def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10, id IndexError: index 5 is out of bounds for axis 0 with size 5 """ - null_xy = find_null(data) - data_c = mean(data) + nan_xy = matrix.nan_indices(data) + data_c = init_impute_fn(data) kdtree = KDTree(data_c, leafsize=leafsize) - for x_i, y_i in null_xy: + for x_i, y_i in nan_xy: distances, indices = kdtree.query(data_c[x_i], k=k+1, eps=eps, p=p, distance_upper_bound=distance_upper_bound) # Will always return itself in the first index. Delete it. distances, indices = distances[1:], indices[1:] # Add small constant to distances to avoid division by 0 distances += 1e-3 - weights = idw(distances) + weights = idw_fn(distances) # Assign missing value the weighted average of `k` nearest neighbours data[x_i][y_i] = np.dot(weights, [data_c[ind][y_i] for ind in indices]) return data diff --git a/impyute/imputation/cs/random.py b/impyute/imputation/cs/random.py index 96653ee..f9a6000 100644 --- a/impyute/imputation/cs/random.py +++ b/impyute/imputation/cs/random.py @@ -1,11 +1,9 @@ -""" impyute.imputation.cs.random """ import numpy as np -from impyute.util import find_null -from impyute.util import preprocess -from impyute.util import checks +from impyute.ops import matrix +from impyute.ops import wrapper -@preprocess -@checks +@wrapper.wrappers +@wrapper.checks def random(data): """ Fill missing values in with a randomly selected value from the same column. @@ -21,8 +19,8 @@ def random(data): Imputed data. 
""" - null_xy = find_null(data) - for x, y in null_xy: + nan_xy = matrix.nan_indices(data) + for x, y in nan_xy: uniques = np.unique(data[:, y]) uniques = uniques[~np.isnan(uniques)] data[x][y] = np.random.choice(uniques) diff --git a/impyute/imputation/ts/locf.py b/impyute/imputation/ts/locf.py index fc5f449..5b87f40 100644 --- a/impyute/imputation/ts/locf.py +++ b/impyute/imputation/ts/locf.py @@ -1,11 +1,10 @@ -""" impyute.imputation.ts.locf """ import numpy as np -from impyute.util import find_null -from impyute.util import checks -from impyute.util import preprocess -from impyute.util.errors import BadInputError -@preprocess -@checks +from impyute.ops import matrix +from impyute.ops import wrapper +from impyute.ops import error + +@wrapper.wrappers +@wrapper.checks def locf(data, axis=0): """ Last Observation Carried Forward @@ -34,10 +33,10 @@ def locf(data, axis=0): elif axis == 1: pass else: - raise BadInputError("Error: Axis value is invalid, please use either 0 (row format) or 1 (column format)") + raise error.BadInputError("Error: Axis value is invalid, please use either 0 (row format) or 1 (column format)") - null_xy = find_null(data) - for x_i, y_i in null_xy: + nan_xy = matrix.nan_indices(data) + for x_i, y_i in nan_xy: # Simplest scenario, look one row back if x_i-1 > -1: data[x_i][y_i] = data[x_i-1][y_i] diff --git a/impyute/imputation/ts/moving_window.py b/impyute/imputation/ts/moving_window.py index 10a3c56..1b84796 100644 --- a/impyute/imputation/ts/moving_window.py +++ b/impyute/imputation/ts/moving_window.py @@ -1,13 +1,12 @@ -""" impyute.imputation.ts.moving_window """ import numpy as np -from impyute.util import find_null -from impyute.util import checks -from impyute.util import preprocess +from impyute.ops import matrix +from impyute.ops import wrapper # pylint: disable=invalid-name, too-many-arguments, too-many-locals, too-many-branches, broad-except, len-as-condition -@preprocess -@checks -def moving_window(data, nindex=None, wsize=5, 
errors="coerce", func=np.mean, inplace=False): +@wrapper.wrappers +@wrapper.checks +def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean, + inplace=False): """ Interpolate the missing values based on nearby values. For example, with an array like this: @@ -90,9 +89,9 @@ def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean, inp wside_right = wsize - nindex - 1 while True: - null_xy = find_null(data) - n_null_prev = len(null_xy) - for x_i, y_i in null_xy: + nan_xy = matrix.nan_indices(data) + n_nan_prev = len(nan_xy) + for x_i, y_i in nan_xy: left_i = max(0, y_i-wside_left) right_i = min(len(data), y_i+wside_right+1) window = data[x_i, left_i: right_i] @@ -123,7 +122,7 @@ def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean, inp data[x_i][y_i] = func(window_not_null) except Exception as e: print("Exception:", e) - if n_null_prev == len(find_null(data)): + if n_nan_prev == len(matrix.nan_indices(data)): break return data diff --git a/impyute/ops/__init__.py b/impyute/ops/__init__.py new file mode 100644 index 0000000..12f7bfc --- /dev/null +++ b/impyute/ops/__init__.py @@ -0,0 +1,12 @@ +""" Unorganized set of utility functions """ + +from . import error +from . import inverse_distance_weighting +from . import matrix +from . import util +from . 
import wrapper + +__all__ = [ + "error", "inverse_distance_weighting", "matrix", + "util", "wrapper" + ] diff --git a/impyute/ops/error.py b/impyute/ops/error.py new file mode 100644 index 0000000..4805ae4 --- /dev/null +++ b/impyute/ops/error.py @@ -0,0 +1,9 @@ +""" Impyute specific error messages """ + +class BadInputError(Exception): + "Error thrown when input args don't match spec" + pass + +class BadOutputError(Exception): + "Error thrown when outputs don't match spec" + pass diff --git a/impyute/util/inverse_distance_weighting.py b/impyute/ops/inverse_distance_weighting.py similarity index 91% rename from impyute/util/inverse_distance_weighting.py rename to impyute/ops/inverse_distance_weighting.py index a5a8fa1..b0ffbca 100644 --- a/impyute/util/inverse_distance_weighting.py +++ b/impyute/ops/inverse_distance_weighting.py @@ -1,4 +1,4 @@ -""" impyute.util.inverse_distance_weighting """ +""" Assign weights to distances in a way such that farther values are weighed less """ import numpy as np def shepards(distances, power=2): diff --git a/impyute/ops/matrix.py b/impyute/ops/matrix.py new file mode 100644 index 0000000..51321a1 --- /dev/null +++ b/impyute/ops/matrix.py @@ -0,0 +1,47 @@ +""" Common operations on matrices + +*Look into whether it's worth writing these in raw c* +""" +import numpy as np + +def nan_indices(data): + """ Finds the indices of all missing values. 
+ + Parameters + ---------- + data: numpy.ndarray + + Returns + ------- + List of tuples + Indices of all missing values in tuple format; (i, j) + """ + return np.argwhere(np.isnan(data)) + +def map_nd(fn, arr): + """ Map fn that takes a value over entire n-dim array + + Parameters + ---------- + arr: numpy.ndarray + + Returns + ------- + numpy.ndarray + + """ + return np.vectorize(fn)(arr) + +def every_nd(fn, arr): + """ Returns bool, true if fn is true for all elements of arr + + Parameters + ---------- + arr: numpy.ndarray + + Returns + ------- + bool + + """ + return all(map(fn, arr.flatten())) diff --git a/impyute/util/testing.py b/impyute/ops/testing.py similarity index 82% rename from impyute/util/testing.py rename to impyute/ops/testing.py index f022444..989e66b 100644 --- a/impyute/util/testing.py +++ b/impyute/ops/testing.py @@ -1,3 +1,4 @@ +""" Utilities used for unit tests """ import numpy as np @@ -16,4 +17,4 @@ def return_na_check(data): """ assert isinstance(data, np.ndarray) - assert not np.isnan(data).any() \ No newline at end of file + assert not np.isnan(data).any() diff --git a/impyute/ops/util.py b/impyute/ops/util.py new file mode 100644 index 0000000..24d91ee --- /dev/null +++ b/impyute/ops/util.py @@ -0,0 +1,39 @@ +""" Random utility functions """ +from functools import wraps + +# Things that get exposed from * import +__all__ = [ + "constantly", "complement", "identity", "thread", + "execute_fn_with_args_and_or_kwargs" + ] + +def thread(arg, *fns): + if len(fns) > 0: + return thread(fns[0](arg), *fns[1:]) + else: + return arg + +def identity(x): + return x + +def constantly(x): + """ Returns a function that takes any args and returns x """ + def func(*args, **kwargs): + return x + return func + +def complement(fn): + """ Return fn that outputs the opposite truth values of the + input function + """ + @wraps(fn) + def wrapper(*args, **kwargs): + return not fn(*args, **kwargs) + return wrapper + +def execute_fn_with_args_and_or_kwargs(fn, 
args, kwargs): + """ If args + kwargs aren't accepted only args are passed in""" + try: + return fn(*args, **kwargs) + except TypeError: + return fn(*args) diff --git a/impyute/ops/wrapper.py b/impyute/ops/wrapper.py new file mode 100644 index 0000000..81c02b1 --- /dev/null +++ b/impyute/ops/wrapper.py @@ -0,0 +1,186 @@ +""" Decorator functions to wrap around entry and exit + +... to easily apply to a function, functions that check/process inputs +and outputs +""" +from functools import wraps +import numpy as np + +from . import error +from . import matrix +from . import util as u + +## Hacky way to handle python2 not having `ModuleNotFoundError` +# pylint: disable=redefined-builtin, missing-docstring +try: + raise ModuleNotFoundError +except NameError: + class ModuleNotFoundError(Exception): + pass +except ModuleNotFoundError: + pass +# pylint: enable=redefined-builtin, missing-docstring + +def get_pandas_df(): + """ Gets pandas DataFrame if we can import it """ + try: + import pandas as pd + df = pd.DataFrame + except (ModuleNotFoundError, ImportError): + df = None + return df + +def handle_df(fn): + """ Decorator to handle pandas Dataframe object as input + + If the first arg is a pandas dataframe, convert it to a numpy array + otherwise don't do anything. 
Cast back to a pandas Dataframe after + the imputation function has run + """ + @wraps(fn) + def wrapper(*args, **kwargs): + postprocess_fn = None + ## convert tuple to list so args can be modified + args = list(args) + ## Either make a copy or use a pointer to the original + if kwargs.get('inplace'): + args[0] = args[0] + else: + args[0] = args[0].copy() + + ## If input data is a dataframe then cast the input to an np.array + ## and set an indicator flag before continuing + pd_DataFrame = get_pandas_df() + if pd_DataFrame and isinstance(args[0], pd_DataFrame): + postprocess_fn = pd_DataFrame + args[0] = args[0].values + + ## function invokation + results = u.execute_fn_with_args_and_or_kwargs(fn, args, kwargs) + + ## cast the output back to a DataFrame. + if postprocess_fn is not None: + results = postprocess_fn(results) + + return results + return wrapper + +def add_inplace_option(fn): + """ Decorator for inplace option + + Functions wrapped by this can have an `inplace` kwarg to use either a copy of + data or reference """ + @wraps(fn) + def wrapper(*args, **kwargs): + """ Run input checks""" + ## convert tuple to list so args can be modified + args = list(args) + ## Either make a copy or use a pointer to the original + if kwargs.get('inplace'): + args[0] = args[0] + else: + args[0] = args[0].copy() + + ## function invokation + return u.execute_fn_with_args_and_or_kwargs(fn, args, kwargs) + return wrapper + +def conform_output(fn): + """ Decorator to handle impossible values + + Adds two optional kwargs, `coerce_fn` and `valid_fn`. + + `valid_fn` function stub + + def my_coerce_fn(some_literal) -> boolean + + `coerce_fn` function stub + + def my_coerce_fn(arr, x_i, y_i) -> some_literal + + Valid function is something run on each element of the, this is + the function that we use to indicate whether the value is valid + or not + + Coerce function has three arguments, the original matrix and + the two indices of the invalid value x_i and y_i. 
This function + will be run on all invalid values. + """ + @wraps(fn) + def wrapper(*args, **kwargs): + def raise_error(arr, x_i, y_i): + raise error.BadOutputError("{} does not conform".format(arr[x_i, y_i])) + ## convert tuple to list so args can be modified + args = list(args) + # function that checks if the value is valid + valid_fn = kwargs.get("valid_fn", u.constantly(True)) + # function that modifies the invalid value to something valid + coerce_fn = kwargs.get("coerce_fn", raise_error) + + ## function invokation + results = u.execute_fn_with_args_and_or_kwargs(fn, args, kwargs) + + # check each value to see if it's valid + bool_arr = matrix.map_nd(u.complement(valid_fn), results) + # get indices of invalid values + invalid_indices = np.argwhere(bool_arr) + # run the coerce fn on each invalid indice + for x_i, y_i in invalid_indices: + results[x_i, y_i] = coerce_fn(results, x_i, y_i) + + return results + return wrapper + +def wrappers(fn): + """ Helper decorator, all wrapper functions applied to modify input (matrix + with missing values) and output (matrix with imputed values) + + NOTE: `handle_df` has to be last as it needs to be in the outer loop (first + entry point) since every other function assumes you're getting an np.array + as input + """ + return u.thread( + fn, # function that's getting wrapped + add_inplace_option, # allow choosing reference/copy + conform_output, # allow enforcing of some spec on returned outputs + handle_df, # if df type, cast to np.array on in and df on out + ) + +def _shape_2d(data): + """ True if array is 2D""" + return len(np.shape(data)) == 2 + +def _shape_3d(data): + """ True if array is 3D""" + return len(np.shape(data)) == 3 + +def _is_ndarray(data): + """ True if the array is an instance of numpy's ndarray""" + return isinstance(data, np.ndarray) + +def _dtype_float(data): + """ True if the values in the array are floating point""" + return data.dtype == np.float + +def _nan_exists(data): + """ True if there is at 
least one np.nan in the array""" + nan_xy = matrix.nan_indices(data) + return len(nan_xy) > 0 + +def checks(fn): + """ Throw exception if error runs""" + @wraps(fn) + def wrapper(*args, **kwargs): + data = args[0] + if len(np.shape(data)) != 2: + raise error.BadInputError("No support for arrays that aren't 2D yet.") + elif not _shape_2d(data): + raise error.BadInputError("Not a 2D array.") + elif not _is_ndarray(data): + raise error.BadInputError("Not a np.ndarray.") + elif not _dtype_float(data): + raise error.BadInputError("Data is not float.") + elif not _nan_exists(data): + raise error.BadInputError("No NaN's in given data") + return u.execute_fn_with_args_and_or_kwargs(fn, args, kwargs) + return wrapper diff --git a/impyute/util/__init__.py b/impyute/util/__init__.py deleted file mode 100644 index fd38ce2..0000000 --- a/impyute/util/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -""" Diagnostic tools to find information about data. """ - -from .find_null import find_null -from .describe import describe -from .count_missing import count_missing -from .errors import BadInputError -from .checks import checks -from .compare import compare -from .preprocess import preprocess -from . import inverse_distance_weighting - -__all__ = ["find_null", "describe", "count_missing", - "checks", "compare", "BadInputError", "preprocess", - "inverse_distance_weighting"] diff --git a/impyute/util/checks.py b/impyute/util/checks.py deleted file mode 100644 index 6dbbaa7..0000000 --- a/impyute/util/checks.py +++ /dev/null @@ -1,57 +0,0 @@ -""" impyute.util.check """ -from functools import wraps -import numpy as np -from impyute.util import find_null -from impyute.util import BadInputError - -def checks(fn): - """ Main check function to ensure input is correctly formatted - - Parameters - ---------- - data: numpy.ndarray - Data to impute. 
- - Returns - ------- - bool - True if `data` is correctly formatted - - """ - @wraps(fn) - def wrapper(*args, **kwargs): - """ Run input checks""" - data = args[0] - if len(np.shape(data)) != 2: - raise BadInputError("No support for arrays that aren't 2D yet.") - elif not _shape_2d(data): - raise BadInputError("Not a 2D array.") - elif not _is_ndarray(data): - raise BadInputError("Not a np.ndarray.") - elif not _dtype_float(data): - raise BadInputError("Data is not float.") - elif not _nan_exists(data): - raise BadInputError("No NaN's in given data") - return fn(*args, **kwargs) - return wrapper - -def _shape_2d(data): - """ True if array is 2D""" - return len(np.shape(data)) == 2 - -def _shape_3d(data): - """ True if array is 3D""" - return len(np.shape(data)) == 3 - -def _is_ndarray(data): - """ True if the array is an instance of numpy's ndarray""" - return isinstance(data, np.ndarray) - -def _dtype_float(data): - """ True if the values in the array are floating point""" - return data.dtype == np.float - -def _nan_exists(data): - """ True if there is at least one np.nan in the array""" - null_xy = find_null(data) - return len(null_xy) > 0 diff --git a/impyute/util/errors.py b/impyute/util/errors.py deleted file mode 100644 index 7431283..0000000 --- a/impyute/util/errors.py +++ /dev/null @@ -1,5 +0,0 @@ -""" impyute.util.errors """ - -class BadInputError(Exception): - "Error thrown when input args don't match spec" - pass diff --git a/impyute/util/find_null.py b/impyute/util/find_null.py deleted file mode 100644 index 478c12d..0000000 --- a/impyute/util/find_null.py +++ /dev/null @@ -1,19 +0,0 @@ -""" impyute.util.find_null """ -import numpy as np - -def find_null(data): - """ Finds the indices of all missing values. - - Parameters - ---------- - data: numpy.ndarray - Data to impute. 
- - Returns - ------- - List of tuples - Indices of all missing values in tuple format; (i, j) - - """ - null_xy = np.argwhere(np.isnan(data)) - return null_xy diff --git a/impyute/util/preprocess.py b/impyute/util/preprocess.py deleted file mode 100644 index 36094c7..0000000 --- a/impyute/util/preprocess.py +++ /dev/null @@ -1,71 +0,0 @@ -""" impyute.util.preprocess """ -from functools import wraps - -## Hacky way to handle python2 not having `ModuleNotFoundError` -# pylint: disable=redefined-builtin, missing-docstring -try: - raise ModuleNotFoundError -except NameError: - class ModuleNotFoundError(Exception): - pass -except ModuleNotFoundError: - pass -# pylint: enable=redefined-builtin, missing-docstring - -def execute_fn_with_args_and_or_kwargs(fn, args, kwargs): - """ If args + kwargs aren't accepted only args are passed in""" - try: - return fn(*args, **kwargs) - except TypeError: - return fn(*args) - -def get_pandas_df(): - """ Gets pandas DataFrame if we can import it """ - try: - import pandas as pd - df = pd.DataFrame - except (ModuleNotFoundError, ImportError): - df = None - return df - -def preprocess(fn): - """ Base preprocess function for commonly used preprocessing - - PARAMETERS - ---------- - data: numpy.ndarray - Data to impute. 
- - RETURNS - ------- - bool - True if `data` is correctly formatted - """ - @wraps(fn) - def wrapper(*args, **kwargs): - """ Run input checks""" - postprocess_fn = None - ## convert tuple to list so args can be modified - args = list(args) - ## Either make a copy or use a pointer to the original - if kwargs.get('inplace'): - args[0] = args[0] - else: - args[0] = args[0].copy() - - ## If input data is a dataframe then cast the input to an np.array - ## and set an indicator flag before continuing - pd_DataFrame = get_pandas_df() - if pd_DataFrame and isinstance(args[0], pd_DataFrame): - postprocess_fn = pd_DataFrame - args[0] = args[0].values - - ## function invokation - results = execute_fn_with_args_and_or_kwargs(fn, args, kwargs) - - ## cast the output back to a DataFrame. - if postprocess_fn is not None: - results = postprocess_fn(results) - - return results - return wrapper diff --git a/test/util/__init__.py b/test/contrib/__init__.py similarity index 100% rename from test/util/__init__.py rename to test/contrib/__init__.py diff --git a/test/util/test_compare.py b/test/contrib/test_compare.py similarity index 79% rename from test/util/test_compare.py rename to test/contrib/test_compare.py index b880345..5f2ddaf 100644 --- a/test/util/test_compare.py +++ b/test/contrib/test_compare.py @@ -1,4 +1,3 @@ -"""test_compare.py""" import ast import pytest import numpy as np @@ -15,7 +14,7 @@ def test_output_file_exists(test_data, results_path): imputed_mode.append(["mode", (impy.mode(np.copy(data)), labels)]) imputed_mode.append(["mean", (impy.mean(np.copy(data)), labels)]) - impy.util.compare(imputed_mode, log_path=results_path) + impy.contrib.compare(imputed_mode, log_path=results_path) with open(results_path, 'r') as fin: expected = {'mode': [('SVC', 0.0)], 'mean': [('SVC', 0.0)]} - assert ast.literal_eval(next(fin)) == expected \ No newline at end of file + assert ast.literal_eval(next(fin)) == expected diff --git a/test/dataset/test_mnist.py 
b/test/dataset/test_mnist.py index cd34f5c..c6abd59 100644 --- a/test/dataset/test_mnist.py +++ b/test/dataset/test_mnist.py @@ -1,8 +1,7 @@ -"""test_mnist.py""" import numpy as np import pytest from impyute.dataset import mnist -from impyute.util import find_null +from impyute.ops import matrix pytest.skip("takes ~30 sec each test", allow_module_level=True) data = mnist()["X"] @@ -13,4 +12,4 @@ def test_return_type(): def test_missing_values_present(): """ Check that the dataset is corrupted (missing values present)""" - assert find_null(data).size != 0 + assert matrix.nan_indices(data).size != 0 diff --git a/test/dataset/test_randc.py b/test/dataset/test_randc.py index b65b1d4..2540179 100644 --- a/test/dataset/test_randc.py +++ b/test/dataset/test_randc.py @@ -1,19 +1,20 @@ import numpy as np import pytest from impyute.dataset.base import randc -from impyute.util import BadInputError +from impyute.ops import error def test_raise_error_nlevel_exceed_shape(): - with pytest.raises(BadInputError) as e: + with pytest.raises(error.BadInputError) as e: randc(shape=(2, 2)) expected = "nlevel exceeds the size of desired dataset. 
Please decrease the nlevel or increase the shape" assert str(e.value) == expected @pytest.mark.parametrize("nlevels, shape", [(5, (5,5)), (9, (3,4)), (100, (20, 20))]) def test_nlevel_categories(nlevels, shape): - """ideally the returned matrix should have nlevel+1 different categories, +1 because the Corrupt class introduce np.nan - however, if the missing value introduced by Corrupt class happens to replace a group of categories, the unique - category number would be < nlevel + 1 + """ideally the returned matrix should have nlevel+1 different categories, + +1 because the Corrupt class introduce np.nan however, if the missing value + introduced by Corrupt class happens to replace a group of categories, the + unique category number would be < nlevel + 1 """ dataframe = randc(nlevels, shape) assert len(np.unique(dataframe)) <= nlevels + 1 diff --git a/test/deletion/test_complete_case.py b/test/deletion/test_complete_case.py index 0c0f98b..31c89d0 100644 --- a/test/deletion/test_complete_case.py +++ b/test/deletion/test_complete_case.py @@ -1,7 +1,6 @@ -"""test_complete_case.py""" import numpy as np from impyute.deletion import complete_case -from impyute.util.testing import return_na_check +from impyute.ops.testing import return_na_check SHAPE = (5, 5) diff --git a/test/imputation/cs/test_buck_iterative.py b/test/imputation/cs/test_buck_iterative.py index 5604728..7240da2 100644 --- a/test/imputation/cs/test_buck_iterative.py +++ b/test/imputation/cs/test_buck_iterative.py @@ -1,8 +1,8 @@ """test_buck_iterative.py""" import impyute as impy -from impyute.util.testing import return_na_check +from impyute.ops.testing import return_na_check def test_buck_iter(buck_test_data): imputed = impy.buck_iterative(buck_test_data) - return_na_check(imputed) \ No newline at end of file + return_na_check(imputed) diff --git a/test/imputation/cs/test_central_tendency.py b/test/imputation/cs/test_central_tendency.py index 1384ab1..8615452 100644 --- 
a/test/imputation/cs/test_central_tendency.py +++ b/test/imputation/cs/test_central_tendency.py @@ -1,6 +1,6 @@ """test_averagings.py""" import impyute as impy -from impyute.util.testing import return_na_check +from impyute.ops.testing import return_na_check SHAPE = (5, 5) @@ -20,4 +20,4 @@ def test_mode(test_data): def test_median(test_data): data = test_data(SHAPE) imputed = impy.median(data) - return_na_check(imputed) \ No newline at end of file + return_na_check(imputed) diff --git a/test/imputation/cs/test_em.py b/test/imputation/cs/test_em.py index 43abdbd..61bd822 100644 --- a/test/imputation/cs/test_em.py +++ b/test/imputation/cs/test_em.py @@ -1,6 +1,6 @@ """test_em.py""" import impyute as impy -from impyute.util.testing import return_na_check +from impyute.ops.testing import return_na_check SHAPE = (5, 5) @@ -8,4 +8,4 @@ def test_em_(test_data): data = test_data(SHAPE) imputed = impy.em(data) - return_na_check(imputed) \ No newline at end of file + return_na_check(imputed) diff --git a/test/imputation/cs/test_fast_knn.py b/test/imputation/cs/test_fast_knn.py index b9dabde..2358724 100644 --- a/test/imputation/cs/test_fast_knn.py +++ b/test/imputation/cs/test_fast_knn.py @@ -2,7 +2,7 @@ import functools import numpy as np import impyute as impy -from impyute.util.testing import return_na_check +from impyute.ops.testing import return_na_check # pylint:disable=invalid-name SHAPE = (5, 5) @@ -14,15 +14,15 @@ def test_return_type(knn_test_data): def test_impute_value(test_data): - "fast_knn using standard idw" + """fast_knn using standard idw""" data = test_data(SHAPE, 0, 2) imputed = impy.fast_knn(data, k=2) assert np.isclose(imputed[0, 2], 8.38888888888889) def test_impute_value_custom_idw(test_data): - "fast_knn using custom idw" + """fast_knn using custom idw""" data = test_data(SHAPE, 0, 2) - idw = functools.partial(impy.util.inverse_distance_weighting.shepards, power=1) - imputed = impy.fast_knn(data, k=2, idw=idw) + idw_fn = 
functools.partial(impy.ops.inverse_distance_weighting.shepards, power=1) + imputed = impy.fast_knn(data, k=2, idw_fn=idw_fn) assert np.isclose(imputed[0, 2], 8.913911092686593) diff --git a/test/imputation/cs/test_random.py b/test/imputation/cs/test_random.py index 52bf479..23292a1 100644 --- a/test/imputation/cs/test_random.py +++ b/test/imputation/cs/test_random.py @@ -1,6 +1,6 @@ """test_random_imputation.py""" import impyute as impy -from impyute.util.testing import return_na_check +from impyute.ops.testing import return_na_check SHAPE = (3, 3) @@ -8,4 +8,4 @@ def test_random_(test_data): data = test_data(SHAPE) imputed = impy.random(data) - return_na_check(imputed) \ No newline at end of file + return_na_check(imputed) diff --git a/test/imputation/ts/test_locf.py b/test/imputation/ts/test_locf.py index 600d73c..4a5b5fc 100644 --- a/test/imputation/ts/test_locf.py +++ b/test/imputation/ts/test_locf.py @@ -1,8 +1,8 @@ """test_locf.py""" import numpy as np import impyute as impy -from impyute.util.testing import return_na_check -from impyute.util.errors import BadInputError +from impyute.ops.testing import return_na_check +from impyute.ops import error SHAPE = (5, 5) @@ -39,5 +39,5 @@ def test_na_at_i_end(test_data): def test_out_of_bounds(test_data): """Check out of bounds error, should throw BadInputError for any axis outside [0,1]""" data = test_data(SHAPE) - with np.testing.assert_raises(BadInputError): + with np.testing.assert_raises(error.BadInputError): impy.locf(data, axis=3) diff --git a/test/imputation/ts/test_moving_window.py b/test/imputation/ts/test_moving_window.py index c404b72..947e78e 100644 --- a/test/imputation/ts/test_moving_window.py +++ b/test/imputation/ts/test_moving_window.py @@ -2,7 +2,7 @@ import pytest import numpy as np import impyute as impy -from impyute.util.testing import return_na_check +from impyute.ops.testing import return_na_check #pylint:disable=missing-docstring, redefined-outer-name diff --git a/test/ops/__init__.py 
b/test/ops/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/ops/test_matrix.py b/test/ops/test_matrix.py new file mode 100644 index 0000000..5efc7c4 --- /dev/null +++ b/test/ops/test_matrix.py @@ -0,0 +1,17 @@ +import numpy as np +from impyute.ops import matrix + +def _is_gt_5(x): + return x > 5 + +def test_map_nd_2d(): + arr = np.arange(10).reshape([5, 2]) + expected = np.array([ + [False, False], + [False, False], + [False, False], + [True, True], + [True, True], + ]) + actual = matrix.map_nd(_is_gt_5, arr) + assert matrix.every_nd(bool, expected == actual) diff --git a/test/ops/test_util.py b/test/ops/test_util.py new file mode 100644 index 0000000..3a9c681 --- /dev/null +++ b/test/ops/test_util.py @@ -0,0 +1,24 @@ +import numpy as np +from impyute.ops import matrix +from impyute.ops import util + +def _add_one(x): + """ """ + return x + 1 + +def _square(x): + return x * x + +def test_thread(): + assert 10 == util.thread(3, _square, _add_one) + assert 100 == util.thread(3, _square, _add_one, _square) #4 + assert 82 == util.thread(3, _square, _square, _add_one) #4 + assert 10 == util.thread(3, lambda x: x*x, lambda x: x+1) + assert 100 == util.thread(3, lambda x: x*x, lambda x: x+1, lambda x: x*x) + assert 82 == util.thread(3, lambda x: x*x, lambda x: x*x, lambda x: x+1) + +def test_identity(): + arr = np.array([[1., 2., 3.]]) + actual = arr + expected = util.identity(arr) + assert matrix.every_nd(bool, expected == actual) diff --git a/test/ops/test_wrapper.py b/test/ops/test_wrapper.py new file mode 100644 index 0000000..46db79a --- /dev/null +++ b/test/ops/test_wrapper.py @@ -0,0 +1,132 @@ +import pytest +import numpy as np +from impyute.imputation.cs import mean +from impyute.ops import error +from impyute.ops import wrapper + +# pylint:disable=redefined-builtin +try: + raise ModuleNotFoundError +except NameError: + class ModuleNotFoundError(Exception): + "placeholder required for python2.7" + pass +except ModuleNotFoundError: + pass + 
+@wrapper.wrappers +def wrappers_mul(arr): + """Some function that performs an inplace operation on the input. Accepts kwargs""" + arr *= 25 + return arr + +def test_wrappers_inplace_false(): + """Input should be unchanged if inplace set to false""" + A = np.ones((5, 5)) + A_copy = A.copy() + wrappers_mul(A, inplace=False) + assert A[0, 0] == A_copy[0, 0] + +def test_wrappers_inplace_true(): + """Input may be changed if inplace set to true and operation is inplace""" + A = np.ones((5, 5)) + A_copy = A.copy() + wrappers_mul(A, inplace=True) + assert A[0, 0] != A_copy[0, 0] + +def test_wrappers_pandas_input(): + """ Input: DataFrame, Output: DataFrame """ + # Skip this test if you don't have pandas + pytest.importorskip('pandas') + import pandas as pd + # Create a DataFrame with a NaN + A = np.arange(25).reshape((5, 5)).astype(np.float) + A[0, 0] = np.nan + A = pd.DataFrame(A) + # Assert that the output is a DataFrame + assert isinstance(mean(A), pd.DataFrame) + +@wrapper.checks +def some_fn(data): + """Dummy fn that has form of np.array -> np.array""" + return data + +def test_correct_input(): + """ Test that an array that should satisfy all checks, no BadInputError should be raised""" + # Integer np.ndarray (check: `_is_ndarray`, `_shape_2d`, `_nan_exists`) + arr = np.array([[np.nan, 2], [3, 4]]) + # Cast integer array to float (check: `_dtype_float`) + arr.dtype = np.float + try: + some_fn(arr) + except error.BadInputError: + assert False + +def test_1d(): + """ Check 1d array, BadInputError raised""" + arr = np.array([np.nan, 2]) + with pytest.raises(error.BadInputError) as excinfo: + some_fn(arr) + assert str(excinfo.value) == "No support for arrays that aren't 2D yet." + +def test_not_nparray(): + """ If not an np.array, BadInputError raised""" + with pytest.raises(error.BadInputError) as excinfo: + some_fn([[np.nan, 2.], [3, 4]]) + assert str(excinfo.value) == "Not a np.ndarray." 
+ +def test_nan_exists(): + """ If no NaN, BadInputError raised""" + with pytest.raises(error.BadInputError) as excinfo: + some_fn(np.array([[1.]])) + assert str(excinfo.value) == "No NaN's in given data" + +@wrapper.conform_output +def conform_output_dummy(x): + return x + +def is_between_0_1(x): + return 0 <= x <= 1 + +def coerce_between_0_1(arr, x_i, y_i): + val = arr[x_i, y_i] + if val < 0: + return 0 + elif val > 1: + return 1 + else: + return x + +def test_conform_output_not_used(): + """ If neither args passed, don't do anything""" + assert "some input" == conform_output_dummy("some input") + +def test_conform_output_valid_coerce(): + """ Check value valid and coerce invalid values""" + arr = np.array([[1.1, 0.5], [0.2, -1]]) + actual = conform_output_dummy( + arr, + valid_fn=is_between_0_1, + coerce_fn=coerce_between_0_1, + ) + expected = np.array([[1.0, 0.5], [0.2, 0.0]]) + assert np.array_equal(expected, actual) + +def test_conform_output_coerce(): + """ Coerce function doesn't run if no valid_fn passed """ + arr = np.array([[1.1, 0.5], [0.2, -1]]) + actual = conform_output_dummy( + arr, + coerce_fn=coerce_between_0_1, + ) + expected = np.array([[1.1, 0.5], [0.2, -1]]) + assert np.array_equal(expected, actual) + +def test_conform_output_valid(): + """ No coerce_fn with valid_fn will raise BadOutputError if invalid values + encountered.
First invalid value is 1.1 + """ + arr = np.array([[1.1, 0.5], [0.2, -1]]) + with pytest.raises(error.BadOutputError) as excinfo: + conform_output_dummy(arr, valid_fn=is_between_0_1) + assert str(excinfo.value) == "1.1 does not conform" diff --git a/test/util/test_checks.py b/test/util/test_checks.py deleted file mode 100644 index a5ae2df..0000000 --- a/test/util/test_checks.py +++ /dev/null @@ -1,45 +0,0 @@ -"""test_checks.py""" -import pytest -import numpy as np -from impyute.util import BadInputError -from impyute.util import checks - - -@checks -def some_fn(data): - """Dummy fn that has form of np.array -> np.array""" - return data - - -def test_correct_input(): - """ Test that an array that should satisfy all checks, no BadInputError should be raised""" - # Integer np.ndarray (check: `_is_ndarray`, `_shape_2d`, `_nan_exists`) - arr = np.array([[np.nan, 2], [3, 4]]) - # Cast integer array to float (check: `_dtype_float`) - arr.dtype = np.float - try: - some_fn(arr) - except BadInputError: - assert False - - -def test_1d(): - """ Check 1d array, BadInputError raised""" - arr = np.array([np.nan, 2]) - with pytest.raises(BadInputError) as excinfo: - some_fn(arr) - assert str(excinfo.value) == "No support for arrays that aren't 2D yet." - - -def test_not_nparray(): - """ If not an np.array, BadInputError raised""" - with pytest.raises(BadInputError) as excinfo: - some_fn([[np.nan, 2.], [3, 4]]) - assert str(excinfo.value) == "Not a np.ndarray." 
- - -def test_nan_exists(): - """ If no NaN, BadInputError raised""" - with pytest.raises(BadInputError) as excinfo: - some_fn(np.array([[1.]])) - assert str(excinfo.value) == "No NaN's in given data" diff --git a/test/util/test_preprocess.py b/test/util/test_preprocess.py deleted file mode 100644 index 6cd7fe1..0000000 --- a/test/util/test_preprocess.py +++ /dev/null @@ -1,83 +0,0 @@ -"""test_preprocess.py""" -import pytest -import numpy as np -from impyute.util import preprocess -from impyute.imputation.cs import mean - -# pylint:disable=redefined-builtin -try: - raise ModuleNotFoundError -except NameError: - class ModuleNotFoundError(Exception): - "placeholder required for python2.7" - pass -except ModuleNotFoundError: - pass - -# pylint:disable=unused-argument -@preprocess -def mul(arr, **kwargs): - """Some function that performs an inplace operation on the input. Accepts kwargs""" - arr *= 25 - return arr - - -@preprocess -def mul_no_kwargs(arr): - """Some function that performs an inplace operation on the input""" - arr *= 25 - return arr - - -def test_inplace_false(): - """Input should be unchanged if inplace set to false""" - A = np.ones((5, 5)) - A_copy = A.copy() - mul(A, inplace=False) - assert A[0, 0] == A_copy[0, 0] - - -def test_inplace_true(): - """Input may be changed if inplace set to true and operation is inplace""" - A = np.ones((5, 5)) - A_copy = A.copy() - mul(A, inplace=True) - assert A[0, 0] != A_copy[0, 0] - - -def test_inplace_false_nokwargs(): - """Test that passed in function doesn't need to set kwargs as parameters - Input should be unchanged if inplace set to false - """ - A = np.ones((5, 5)) - A_copy = A.copy() - # pylint: disable = unexpected-keyword-arg - mul_no_kwargs(A, inplace=False) - # pylint: enable = unexpected-keyword-arg - assert A[0, 0] == A_copy[0, 0] - - -def test_inplace_true_nokwargs(): - """Test that passed in function doesn't need to set kwargs as parameters - Input may be changed if inplace set to true and operation 
is inplace - """ - A = np.ones((5, 5)) - A_copy = A.copy() - # pylint: disable = unexpected-keyword-arg - mul_no_kwargs(A, inplace=True) - # pylint: enable = unexpected-keyword-arg - assert A[0, 0] != A_copy[0, 0] - - -def test_pandas_input(): - """ Input: DataFrame, Output: DataFrame """ - # Skip this test if you don't have pandas - pytest.importorskip('pandas') - import pandas as pd - # Create a DataFrame with a NaN - A = np.arange(25).reshape((5, 5)).astype(np.float) - A[0, 0] = np.nan - A = pd.DataFrame(A) - - # Assert that the output is a DataFrame - assert isinstance(mean(A), pd.DataFrame)