Skip to content

Commit

Permalink
Merge pull request #301 from nickjcroucher/update_model_fitting
Browse files Browse the repository at this point in the history
Improvements to DBSCAN fitting - moving from fork to main repo
  • Loading branch information
nickjcroucher authored Feb 16, 2024
2 parents 2c1eeef + bc2ba19 commit c080e2a
Show file tree
Hide file tree
Showing 8 changed files with 305 additions and 154 deletions.
56 changes: 47 additions & 9 deletions PopPUNK/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,34 @@ def get_options():

# model fitting
modelGroup = parser.add_argument_group('Model fit options')
modelGroup.add_argument('--K', help='Maximum number of mixture components [default = 2]', type=int, default=2)
modelGroup.add_argument('--D', help='Maximum number of clusters in DBSCAN fitting [default = 100]', type=int, default=100)
modelGroup.add_argument('--min-cluster-prop', help='Minimum proportion of points in a cluster '
'in DBSCAN fitting [default = 0.0001]', type=float, default=0.0001)
modelGroup.add_argument('--threshold', help='Cutoff if using --fit-model threshold', type=float)
modelGroup.add_argument('--model-subsample',
help='Number of pairwise distances used to fit model [default = 100000]',
type=int,
default=100000)
modelGroup.add_argument('--assign-subsample',
help='Number of pairwise distances in each assignment batch [default = 5000]',
type=int,
default=5000)
modelGroup.add_argument('--no-assign',
help='Fit the model without assigning all points (only applies to BGMM and DBSCAN models)',
default=False,
action='store_true')
modelGroup.add_argument('--K',
help='Maximum number of mixture components [default = 2]',
type=int,
default=2)
modelGroup.add_argument('--D',
help='Maximum number of clusters in DBSCAN fitting [default = 100]',
type=int,
default=100)
modelGroup.add_argument('--min-cluster-prop',
help='Minimum proportion of points in a cluster '
'in DBSCAN fitting [default = 0.0001]',
type=float,
default=0.0001)
modelGroup.add_argument('--threshold',
help='Cutoff if using --fit-model threshold',
type=float)

# model refinement
refinementGroup = parser.add_argument_group('Refine model options')
Expand Down Expand Up @@ -180,6 +203,7 @@ def get_options():
other.add_argument('--threads', default=1, type=int, help='Number of threads to use [default = 1]')
other.add_argument('--gpu-sketch', default=False, action='store_true', help='Use a GPU when calculating sketches (read data only) [default = False]')
other.add_argument('--gpu-dist', default=False, action='store_true', help='Use a GPU when calculating distances [default = False]')
other.add_argument('--gpu-model', default=False, action='store_true', help='Use a GPU when fitting a model [default = False]')
other.add_argument('--gpu-graph', default=False, action='store_true', help='Use a GPU when calculating networks [default = False]')
other.add_argument('--deviceid', default=0, type=int, help='CUDA device ID, if using GPU [default = 0]')
other.add_argument('--no-plot', help='Switch off model plotting, which can be slow for large datasets',
Expand Down Expand Up @@ -491,14 +515,24 @@ def main():
if args.fit_model:
# Run DBSCAN model
if args.fit_model == "dbscan":
model = DBSCANFit(output)
model = DBSCANFit(output,
max_samples = args.model_subsample,
max_batch_size = args.assign_subsample,
assign_points = not args.no_assign)
model.set_threads(args.threads)
assignments = model.fit(distMat, args.D, args.min_cluster_prop)
assignments = model.fit(distMat,
args.D,
args.min_cluster_prop,
args.gpu_model)
# Run Gaussian model
elif args.fit_model == "bgmm":
model = BGMMFit(output)
model = BGMMFit(output,
max_samples = args.model_subsample,
max_batch_size = args.assign_subsample,
assign_points = not args.no_assign)
model.set_threads(args.threads)
assignments = model.fit(distMat, args.K)
assignments = model.fit(distMat,
args.K)
elif args.fit_model == "refine":
new_model = RefineFit(output)
new_model.set_threads(args.threads)
Expand Down Expand Up @@ -557,6 +591,10 @@ def main():
else:
assignments = model.assign(distMat)

# end here if not assigning data
if args.no_assign:
sys.exit(0)

#******************************#
#* *#
#* network construction *#
Expand Down
5 changes: 5 additions & 0 deletions PopPUNK/assign.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,11 @@ def assign_query_hdf5(dbFuncs,
raise RuntimeError("lineage models cannot be used with --serial")
model.set_threads(threads)

# Only proceed with a fully-fitted model
if not model.fitted or (hasattr(model,'assign_points') and model.assign_points == False):
sys.stderr.write('Cannot assign points with an incompletely-fitted model\n')
sys.exit(1)

# Set directories of previous fit
if previous_clustering is not None:
prev_clustering = previous_clustering
Expand Down
44 changes: 31 additions & 13 deletions PopPUNK/dbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
# hdbscan
import hdbscan

def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False):
    """Fit a HDBSCAN model to core/accessory distances.

    Fits the DBSCAN model to the distances as an alternative to the Gaussian
    mixture, using hdbscan on CPU or cuml's HDBSCAN on GPU.

    Args:
        X (numpy.array or cupy.array)
            n x 2 array of pairwise core and accessory distances
        min_samples (int)
            Parameter controlling how conservative the clustering is
        min_cluster_size (int)
            Minimum number of points in a cluster for HDBSCAN
        cache_out (str)
            Prefix for DBSCAN cache used for refitting (CPU path only)
        use_gpu (bool)
            Whether GPU algorithms should be used in DBSCAN fitting
            [default = False]

    Returns:
        hdb (hdbscan.HDBSCAN or cuml.cluster.HDBSCAN)
            Fitted HDBSCAN to subsampled data
        labels (list)
            Cluster assignments of each sample
        n_clusters (int)
            Number of clusters used
    """
    if use_gpu:
        # Imported lazily so a CPU-only installation never needs RAPIDS
        from cuml import cluster
        import cupy as cp
        sys.stderr.write('Fitting HDBSCAN model using a GPU\n')
        hdb = cluster.hdbscan.HDBSCAN(min_samples = min_samples,
                                      output_type = 'cupy',
                                      prediction_data = True,
                                      min_cluster_size = min_cluster_size
                                      ).fit(X)
        # Number of clusters in labels, ignoring noise (label -1) if present.
        labels = hdb.labels_
        n_clusters = len(cp.unique(labels[labels > -1]))
    else:
        sys.stderr.write('Fitting HDBSCAN model using a CPU\n')
        hdb = hdbscan.HDBSCAN(algorithm='boruvka_balltree',
                              min_samples = min_samples,
                              #core_dist_n_jobs = threads, # may cause error, see #19
                              memory = cache_out,
                              prediction_data = True,
                              min_cluster_size = min_cluster_size
                              ).fit(X)
        # Number of clusters in labels, ignoring noise (label -1) if present.
        labels = hdb.labels_
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

    # return model parameters
    return hdb, labels, n_clusters
Expand Down Expand Up @@ -70,7 +88,7 @@ def evaluate_dbscan_clusters(model):

# evaluate whether maxima of cluster nearest origin do not
# overlap with minima of cluster furthest from origin
if core_minimum_of_between > core_maximum_of_within and \
if core_minimum_of_between > core_maximum_of_within or \
accessory_minimum_of_between > accessory_maximum_of_within:
indistinct = False

Expand Down
Loading

0 comments on commit c080e2a

Please sign in to comment.