From 38c27beaa9f5b5f250458617930ab005977f66b1 Mon Sep 17 00:00:00 2001 From: Leland McInnes Date: Wed, 20 Jun 2018 10:14:40 -0400 Subject: [PATCH] Fix sparse correlation (at last) per issue #71 --- setup.py | 2 +- umap/sparse.py | 27 ++++++++++++++++----------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/setup.py b/setup.py index 87417117..f880a587 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ def readme(): configuration = { 'name' : 'umap-learn', - 'version': '0.2.3', + 'version': '0.2.5', 'description' : 'Uniform Manifold Approximation and Projection', 'long_description' : readme(), 'classifiers' : [ diff --git a/umap/sparse.py b/umap/sparse.py index 2c4025f8..50ce3bf6 100644 --- a/umap/sparse.py +++ b/umap/sparse.py @@ -680,6 +680,11 @@ def sparse_correlation(ind1, data1, ind2, data2, n_features): mu_y = 0.0 dot_product = 0.0 + if ind1.shape[0] == 0 and ind2.shape[0] == 0: + return 0.0 + elif ind1.shape[0] == 0 or ind2.shape[0] == 0: + return 1.0 + for i in range(data1.shape[0]): mu_x += data1[i] for i in range(data2.shape[0]): @@ -688,23 +693,20 @@ def sparse_correlation(ind1, data1, ind2, data2, n_features): mu_x /= n_features mu_y /= n_features - shifted_data1 = np.empty(data1.shape[0], dtype=np.float64) - shifted_data2 = np.empty(data2.shape[0], dtype=np.float64) + shifted_data1 = np.empty(data1.shape[0], dtype=np.float32) + shifted_data2 = np.empty(data2.shape[0], dtype=np.float32) for i in range(data1.shape[0]): shifted_data1[i] = data1[i] - mu_x for i in range(data2.shape[0]): shifted_data2[i] = data2[i] - mu_y - norm1 = np.sqrt(norm(shifted_data1) ** 2 + (n_features - ind1.shape[0]) * mu_x ** 2) - norm2 = np.sqrt(norm(shifted_data2) ** 2 + (n_features - ind2.shape[0]) * mu_y ** 2) + norm1 = np.sqrt((norm(shifted_data1) ** 2) + (n_features - ind1.shape[0]) * (mu_x ** 2)) + norm2 = np.sqrt((norm(shifted_data2) ** 2) + (n_features - ind2.shape[0]) * (mu_y ** 2)) dot_prod_inds, dot_prod_data = sparse_mul(ind1, shifted_data1, ind2, shifted_data2) - if dot_prod_data.shape[0] == 0: - return 1.0 - common_indices = set(dot_prod_inds) for i in range(dot_prod_data.shape[0]): @@ -712,21 +714,24 @@ def sparse_correlation(ind1, data1, ind2, data2, n_features): for i in range(ind1.shape[0]): if ind1[i] not in common_indices: - dot_product -= data1[i] * (mu_y) + dot_product -= shifted_data1[i] * (mu_y) for i in range(ind2.shape[0]): if ind2[i] not in common_indices: - dot_product -= data2[i] * (mu_x) + dot_product -= shifted_data2[i] * (mu_x) all_indices = arr_union(ind1, ind2) - dot_product += mu_x * mu_y * all_indices.shape[0] + dot_product += mu_x * mu_y * (n_features - all_indices.shape[0]) - if dot_product == 0.0: + if norm1 == 0.0 and norm2 == 0.0: + return 0.0 + elif dot_product == 0.0: return 1.0 else: return (1.0 - (dot_product / (norm1 * norm2))) + sparse_named_distances = { 'euclidean': sparse_euclidean, 'manhattan': sparse_manhattan,