Fix sparse correlation (at last) per issue #71

lmcinnes · Jun 20, 2018 · 38c27be
1 parent d80f313
commit 38c27be
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 12 deletions.
diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@ def readme():
 
 configuration = {
     'name' : 'umap-learn',
-    'version': '0.2.3',
+    'version': '0.2.5',
     'description' : 'Uniform Manifold Approximation and Projection',
     'long_description' : readme(),
     'classifiers' : [

diff --git a/umap/sparse.py b/umap/sparse.py
@@ -680,6 +680,11 @@ def sparse_correlation(ind1, data1, ind2, data2, n_features):
     mu_y = 0.0
     dot_product = 0.0
 
+    if ind1.shape[0] == 0 and ind2.shape[0] == 0:
+        return 0.0
+    elif ind1.shape[0] == 0 or ind2.shape[0] == 0:
+        return 1.0
+
     for i in range(data1.shape[0]):
         mu_x += data1[i]
     for i in range(data2.shape[0]):
@@ -688,45 +693,45 @@ def sparse_correlation(ind1, data1, ind2, data2, n_features):
     mu_x /= n_features
     mu_y /= n_features
 
-    shifted_data1 = np.empty(data1.shape[0], dtype=np.float64)
-    shifted_data2 = np.empty(data2.shape[0], dtype=np.float64)
+    shifted_data1 = np.empty(data1.shape[0], dtype=np.float32)
+    shifted_data2 = np.empty(data2.shape[0], dtype=np.float32)
 
     for i in range(data1.shape[0]):
         shifted_data1[i] = data1[i] - mu_x
     for i in range(data2.shape[0]):
         shifted_data2[i] = data2[i] - mu_y
 
-    norm1 = np.sqrt(norm(shifted_data1) ** 2 + (n_features - ind1.shape[0]) * mu_x ** 2)
-    norm2 = np.sqrt(norm(shifted_data2) ** 2 + (n_features - ind2.shape[0]) * mu_y ** 2)
+    norm1 = np.sqrt((norm(shifted_data1) ** 2) + (n_features - ind1.shape[0]) * (mu_x ** 2))
+    norm2 = np.sqrt((norm(shifted_data2) ** 2) + (n_features - ind2.shape[0]) * (mu_y ** 2))
 
     dot_prod_inds, dot_prod_data = sparse_mul(ind1, shifted_data1,
                                               ind2, shifted_data2)
 
-    if dot_prod_data.shape[0] == 0:
-        return 1.0
-
     common_indices = set(dot_prod_inds)
 
     for i in range(dot_prod_data.shape[0]):
         dot_product += dot_prod_data[i]
 
     for i in range(ind1.shape[0]):
         if ind1[i] not in common_indices:
-            dot_product -= data1[i] * (mu_y)
+            dot_product -= shifted_data1[i] * (mu_y)
 
     for i in range(ind2.shape[0]):
         if ind2[i] not in common_indices:
-            dot_product -= data2[i] * (mu_x)
+            dot_product -= shifted_data2[i] * (mu_x)
 
     all_indices = arr_union(ind1, ind2)
-    dot_product += mu_x * mu_y * all_indices.shape[0]
+    dot_product += mu_x * mu_y * (n_features - all_indices.shape[0])
 
-    if dot_product == 0.0:
+    if norm1 == 0.0 and norm2 == 0.0:
+        return 0.0
+    elif dot_product == 0.0:
         return 1.0
     else:
         return (1.0 - (dot_product / (norm1 * norm2)))
 
 
+
 sparse_named_distances = {
     'euclidean': sparse_euclidean,
     'manhattan': sparse_manhattan,