From 79b42656e1a6e7b1aca6b7fac6036691b4f5e310 Mon Sep 17 00:00:00 2001 From: timsainb Date: Sat, 13 Mar 2021 23:24:35 -0800 Subject: [PATCH 1/2] when dataset is >2GB, switch to numpy function to sample from dataset, to overcome proto limit #608 --- umap/parametric_umap.py | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/umap/parametric_umap.py b/umap/parametric_umap.py index 98fe5982..8bdb8d01 100644 --- a/umap/parametric_umap.py +++ b/umap/parametric_umap.py @@ -344,9 +344,6 @@ def _fit_embed_data(self, X, n_epochs, init, random_state): self.head = tf.constant(tf.expand_dims(head.astype(np.int64), 0)) self.tail = tf.constant(tf.expand_dims(tail.astype(np.int64), 0)) - a, b = next(iter(edge_dataset)) - # breakme - if self.parametric_embedding: init_embedding = None else: @@ -849,6 +846,9 @@ def prepare_networks( return encoder, decoder +from umap.parametric_umap import get_graph_elements + + def construct_edge_dataset( X, graph_, @@ -877,23 +877,38 @@ def construct_edge_dataset( Whether the decoder is parametric or non-parametric """ + def gather_index(index): + return X[index] + + # if X is > 2Gb in size, we need to use a different, slower method for + # batching data. 
+ gather_indices_in_python = True if X.nbytes * 1e-9 > 2 else False + def gather_X(edge_to, edge_from): - edge_to_batch = tf.gather(X, edge_to) - edge_from_batch = tf.gather(X, edge_from) - outputs = {"umap": 0} + # gather data from indexes (edges) in either numpy or tf, depending on array size + if gather_indices_in_python: + edge_to_batch = tf.py_function(gather_index, [edge_to], [tf.float32])[0] + edge_from_batch = tf.py_function(gather_index, [edge_from], [tf.float32])[0] + else: + edge_to_batch = tf.gather(X, edge_to) + edge_from_batch = tf.gather(X, edge_from) + return edge_to_batch, edge_from_batch + + def get_outputs(edge_to_batch, edge_from_batch): + outputs = {"umap": tf.repeat(0, batch_size)} if global_correlation_loss_weight > 0: outputs["global_correlation"] = edge_to_batch - if parametric_reconstruction: # add reconstruction to iterator output # edge_out = tf.concat([edge_to_batch, edge_from_batch], axis=0) outputs["reconstruction"] = edge_to_batch - return (edge_to_batch, edge_from_batch), outputs def make_sham_generator(): """ - The sham generator is used to + The sham generator is a placeholder when all data is already intrinsic to + the model, but keras wants some input data. Used for non-parametric + embedding. 
""" def sham_generator(): @@ -932,10 +947,13 @@ def sham_generator(): ) edge_dataset = edge_dataset.repeat() edge_dataset = edge_dataset.shuffle(10000) + edge_dataset = edge_dataset.batch(batch_size, drop_remainder=True) edge_dataset = edge_dataset.map( gather_X, num_parallel_calls=tf.data.experimental.AUTOTUNE ) - edge_dataset = edge_dataset.batch(batch_size, drop_remainder=True) + edge_dataset = edge_dataset.map( + get_outputs, num_parallel_calls=tf.data.experimental.AUTOTUNE + ) edge_dataset = edge_dataset.prefetch(10) else: # nonparametric embedding uses a sham dataset From 1cfe90ad7f939cda20f758fb7d5dba2d88858a19 Mon Sep 17 00:00:00 2001 From: timsainb Date: Mon, 15 Mar 2021 15:10:41 -0700 Subject: [PATCH 2/2] removed unneeded import --- umap/parametric_umap.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/umap/parametric_umap.py b/umap/parametric_umap.py index 8bdb8d01..fb97fb14 100644 --- a/umap/parametric_umap.py +++ b/umap/parametric_umap.py @@ -846,9 +846,6 @@ def prepare_networks( return encoder, decoder -from umap.parametric_umap import get_graph_elements - - def construct_edge_dataset( X, graph_,