Skip to content

Commit

Permalink
Merge pull request #301 from nickjcroucher/update_model_fitting
Browse files Browse the repository at this point in the history
Improvements to DBSCAN fitting - moving from fork to main repo
  • Loading branch information
nickjcroucher authored Feb 16, 2024
2 parents 2c1eeef + bc2ba19 commit c080e2a
Show file tree
Hide file tree
Showing 8 changed files with 305 additions and 154 deletions.
56 changes: 47 additions & 9 deletions PopPUNK/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,34 @@ def get_options():

# model fitting
modelGroup = parser.add_argument_group('Model fit options')
modelGroup.add_argument('--K', help='Maximum number of mixture components [default = 2]', type=int, default=2)
modelGroup.add_argument('--D', help='Maximum number of clusters in DBSCAN fitting [default = 100]', type=int, default=100)
modelGroup.add_argument('--min-cluster-prop', help='Minimum proportion of points in a cluster '
'in DBSCAN fitting [default = 0.0001]', type=float, default=0.0001)
modelGroup.add_argument('--threshold', help='Cutoff if using --fit-model threshold', type=float)
modelGroup.add_argument('--model-subsample',
help='Number of pairwise distances used to fit model [default = 100000]',
type=int,
default=100000)
modelGroup.add_argument('--assign-subsample',
help='Number of pairwise distances in each assignment batch [default = 5000]',
type=int,
default=5000)
modelGroup.add_argument('--no-assign',
help='Fit the model without assigning all points (only applies to BGMM and DBSCAN models)',
default=False,
action='store_true')
modelGroup.add_argument('--K',
help='Maximum number of mixture components [default = 2]',
type=int,
default=2)
modelGroup.add_argument('--D',
help='Maximum number of clusters in DBSCAN fitting [default = 100]',
type=int,
default=100)
modelGroup.add_argument('--min-cluster-prop',
help='Minimum proportion of points in a cluster '
'in DBSCAN fitting [default = 0.0001]',
type=float,
default=0.0001)
modelGroup.add_argument('--threshold',
help='Cutoff if using --fit-model threshold',
type=float)

# model refinement
refinementGroup = parser.add_argument_group('Refine model options')
Expand Down Expand Up @@ -180,6 +203,7 @@ def get_options():
other.add_argument('--threads', default=1, type=int, help='Number of threads to use [default = 1]')
other.add_argument('--gpu-sketch', default=False, action='store_true', help='Use a GPU when calculating sketches (read data only) [default = False]')
other.add_argument('--gpu-dist', default=False, action='store_true', help='Use a GPU when calculating distances [default = False]')
other.add_argument('--gpu-model', default=False, action='store_true', help='Use a GPU when fitting a model [default = False]')
other.add_argument('--gpu-graph', default=False, action='store_true', help='Use a GPU when calculating networks [default = False]')
other.add_argument('--deviceid', default=0, type=int, help='CUDA device ID, if using GPU [default = 0]')
other.add_argument('--no-plot', help='Switch off model plotting, which can be slow for large datasets',
Expand Down Expand Up @@ -491,14 +515,24 @@ def main():
if args.fit_model:
# Run DBSCAN model
if args.fit_model == "dbscan":
model = DBSCANFit(output)
model = DBSCANFit(output,
max_samples = args.model_subsample,
max_batch_size = args.assign_subsample,
assign_points = not args.no_assign)
model.set_threads(args.threads)
assignments = model.fit(distMat, args.D, args.min_cluster_prop)
assignments = model.fit(distMat,
args.D,
args.min_cluster_prop,
args.gpu_model)
# Run Gaussian model
elif args.fit_model == "bgmm":
model = BGMMFit(output)
model = BGMMFit(output,
max_samples = args.model_subsample,
max_batch_size = args.assign_subsample,
assign_points = not args.no_assign)
model.set_threads(args.threads)
assignments = model.fit(distMat, args.K)
assignments = model.fit(distMat,
args.K)
elif args.fit_model == "refine":
new_model = RefineFit(output)
new_model.set_threads(args.threads)
Expand Down Expand Up @@ -557,6 +591,10 @@ def main():
else:
assignments = model.assign(distMat)

# end here if not assigning data
if args.no_assign:
sys.exit(0)

#******************************#
#* *#
#* network construction *#
Expand Down
5 changes: 5 additions & 0 deletions PopPUNK/assign.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,11 @@ def assign_query_hdf5(dbFuncs,
raise RuntimeError("lineage models cannot be used with --serial")
model.set_threads(threads)

# Only proceed with a fully-fitted model
if not model.fitted or (hasattr(model,'assign_points') and model.assign_points == False):
sys.stderr.write('Cannot assign points with an incompletely-fitted model\n')
sys.exit(1)

# Set directories of previous fit
if previous_clustering is not None:
prev_clustering = previous_clustering
Expand Down
44 changes: 31 additions & 13 deletions PopPUNK/dbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
# hdbscan
import hdbscan

def fitDbScan(X, min_samples, min_cluster_size, cache_out, use_gpu = False):
    """Fit a HDBSCAN model to core/accessory distances.

    Fits the DBSCAN model to the distances as an alternative to the Gaussian
    mixture, using hdbscan on CPU or cuml's HDBSCAN on GPU.

    Args:
        X (numpy.array or cupy.array)
            n x 2 array of pairwise core and accessory distances
        min_samples (int)
            Parameter controlling how conservative the clustering is
        min_cluster_size (int)
            Minimum number of points in a cluster for HDBSCAN
        cache_out (str)
            Prefix for DBSCAN cache used for refitting (CPU path only)
        use_gpu (bool)
            Whether GPU algorithms should be used in DBSCAN fitting
            [default = False]

    Returns:
        hdb (hdbscan.HDBSCAN or cuml.cluster.HDBSCAN)
            Fitted HDBSCAN to subsampled data
        labels (list)
            Cluster assignments of each sample
        n_clusters (int)
            Number of clusters used
    """
    if use_gpu:
        # Imported lazily so a CPU-only installation never needs RAPIDS
        from cuml import cluster
        import cupy as cp
        sys.stderr.write('Fitting HDBSCAN model using a GPU\n')
        hdb = cluster.hdbscan.HDBSCAN(min_samples = min_samples,
                                      output_type = 'cupy',
                                      prediction_data = True,
                                      min_cluster_size = min_cluster_size
                                      ).fit(X)
        # Number of clusters in labels, ignoring noise (label -1) if present.
        labels = hdb.labels_
        n_clusters = len(cp.unique(labels[labels > -1]))
    else:
        sys.stderr.write('Fitting HDBSCAN model using a CPU\n')
        hdb = hdbscan.HDBSCAN(algorithm='boruvka_balltree',
                              min_samples = min_samples,
                              #core_dist_n_jobs = threads, # may cause error, see #19
                              memory = cache_out,
                              prediction_data = True,
                              min_cluster_size = min_cluster_size
                              ).fit(X)
        # Number of clusters in labels, ignoring noise (label -1) if present.
        labels = hdb.labels_
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

    # return model parameters
    return hdb, labels, n_clusters
Expand Down Expand Up @@ -70,7 +88,7 @@ def evaluate_dbscan_clusters(model):

# evaluate whether maxima of cluster nearest origin do not
# overlap with minima of cluster furthest from origin
if core_minimum_of_between > core_maximum_of_within and \
if core_minimum_of_between > core_maximum_of_within or \
accessory_minimum_of_between > accessory_maximum_of_within:
indistinct = False

Expand Down
Loading

0 comments on commit c080e2a

Please sign in to comment.