
Merge pull request #80 from brettc/feature/fastercluster
Feature/fastercluster

We now have the search option rclusterf, a faster version of the rcluster algorithm. I do not yet know exactly how well its results compare to rcluster's, but it should be quite a bit faster in certain situations, especially where the number of models is much smaller than the number of processors you have.
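To see where the speed-up comes from: rcluster pays for one round of candidate scoring per merge, whereas rclusterf applies every improving merge it finds before the next round. A back-of-envelope sketch (my own illustration, not code from this commit; rounds_needed is a hypothetical helper):

def rounds_needed(n_subsets, merges_per_round):
    """Rounds of candidate scoring needed to shrink n_subsets down to 1."""
    rounds, remaining = 0, n_subsets
    while remaining > 1:
        remaining -= min(merges_per_round, remaining - 1)
        rounds += 1
    return rounds

# rcluster applies a single merge per round; rclusterf applies all improving ones.
print(rounds_needed(100, 1))   # 99 rounds
print(rounds_needed(100, 10))  # 10 rounds, if ~10 merges improve the score each round

Each round scores its candidate subset pairs in parallel, so when there is less work per round than there are processors, cutting the number of rounds is the main wall-clock win.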
roblanf committed Nov 13, 2015
2 parents 97fd303 + e394e8e commit b6fcd69
Showing 3 changed files with 45 additions and 27 deletions.
62 changes: 37 additions & 25 deletions partfinder/analysis_method.py
@@ -184,7 +184,7 @@ def do_analysis(self):
c_matrix = spatial.distance.squareform(c_matrix)

# 1. pick top N subset pairs from distance matrix
cutoff = max_schemes
cutoff = max_schemes # this defines the greedy algorithm: we look at all schemes

closest_pairs = neighbour.get_N_closest_subsets(
subsets, the_config, cutoff, d_matrix)
@@ -283,12 +283,13 @@ def do_analysis(self):

class RelaxedClusteringAnalysis(Analysis):
'''
A relaxed clustering algorithm for heuristic partitioning searches
A fast relaxed clustering algorithm for heuristic partitioning searches
1. Rank subsets by their similarity (defined by clustering-weights)
2. Analyse min(cluster-percent or cluster-max) most similar schemes
3. Take the scheme that improves the AIC/BIC score the most
4. Quit if no improvements.
3. Sequentially perform all groupings that improve the AICc/BIC score, in order of improvement
4. Analyse resulting scheme, iterate to 2.
5. Quit if no improvements.
'''
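In toy form, step 3 looks something like the sketch below (my illustration, not PartitionFinder code: negative deltas mean an improved score, and where the real code rebuilds the c-matrix with neighbour.reset_c_matrix after each merge, the toy simply skips pairs whose subsets were already consumed):

def apply_improving_merges(subsets, deltas):
    """subsets: set of subset names; deltas: {(a, b): estimated score change}."""
    merged = []
    for (a, b), change in sorted(deltas.items(), key=lambda kv: kv[1]):
        if change >= 0:
            break                       # remaining candidates cannot improve the score
        if a not in subsets or b not in subsets:
            continue                    # a side was consumed by an earlier, better merge
        subsets -= {a, b}
        subsets.add(a + b)              # stand-in for subset_ops.merge_subsets
        merged.append((a, b))
    return merged

print(apply_improving_merges({'s1', 's2', 's3'},
                             {('s1', 's2'): -4.0, ('s2', 's3'): -1.5, ('s1', 's3'): 2.0}))
# [('s1', 's2')] -- ('s2', 's3') is skipped because s2 was already merged away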

def clean_scheme(self, start_scheme):
@@ -391,7 +392,7 @@ def do_analysis(self):
# 1. pick top N subset pairs from distance matrix
cutoff = int(math.ceil(max_schemes * (the_config.cluster_percent * 0.01)))
if cutoff <= 0: cutoff = 1
if the_config.cluster_max != None and cutoff>the_config.cluster_max:
if the_config.cluster_max != None and cutoff > the_config.cluster_max:
cutoff = the_config.cluster_max
log.info("Choosing the %d most similar subset pairs" % cutoff)
closest_pairs = neighbour.get_N_closest_subsets(
@@ -432,44 +433,52 @@ def do_analysis(self):
# so we need to be careful to only proceed if we have a negative change
# which indicates an improvement in the score
best_change = np.amin(c_matrix)
best_scheme = start_scheme

log.debug("Biggest improvement in info score: %s", str(best_change))

if best_change>=0:
log.info("Found no schemes that improve the score, stopping")
break

best_pair = neighbour.get_best_pair(c_matrix, best_change, subsets)
while best_change<0:

best_pair = neighbour.get_best_pair(c_matrix, best_change, subsets)
best_merged = subset_ops.merge_subsets(best_pair)
best_scheme = neighbour.make_clustered_scheme(
start_scheme, scheme_name, best_pair, best_merged, the_config)
start_scheme = best_scheme

log.info("Combining subsets: '%s' and '%s'" %(best_pair[0].name, best_pair[1].name))
log.info("This improves the %s score by: %s", the_config.model_selection, str(abs(best_change)))

# reset_c_matrix and the subset list
c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair), [best_merged], subsets)

# we update the subset list in a way that means its structure tracks the c-matrix
subsets = neighbour.reset_subsets(subsets, list(best_pair), [best_merged])

best_change = np.amin(c_matrix)

if the_config.search == 'rcluster':
break
# otherwise we are using rclusterf, which continues in this loop

best_merged = subset_ops.merge_subsets(best_pair)
best_scheme = neighbour.make_clustered_scheme(
start_scheme, scheme_name, best_pair, best_merged, the_config)
best_result = self.analyse_scheme(best_scheme)

# the best change can get updated a fraction at this point
# because calculating the info score on the whole alignment
# is a little different from doing it on the one subset
best_result = self.analyse_scheme(best_scheme)
best_change = self.results.best_score - start_score


log.info("Best scheme combines subsets: '%s' and '%s'" %(best_pair[0].name, best_pair[1].name))


log.info("The best scheme improves the %s score by %.2f to %.1f",
log.info("The best scheme has %d subsets and improves the %s score by %.2f to %.1f",
len(best_scheme.subsets),
the_config.model_selection,
np.abs(best_change),
self.results.best_score)
start_scheme = best_scheme
start_score = best_result.score

log.debug("Best pair: %s", str([s.name for s in best_pair]))
log.debug("Merged into: %s", str([best_merged.name]))

# 5. reset_c_matrix and the subset list
c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair), [best_merged], subsets)

# we update the subset list in a way that means its structure tracks the c-matrix
subsets = neighbour.reset_subsets(subsets, list(best_pair), [best_merged])

if not the_config.quick:
the_config.reporter.write_scheme_summary(
@@ -487,8 +496,10 @@ def do_analysis(self):
self.results.best_score))

if the_config.min_subset_size or the_config.all_states:
best_scheme = self.clean_scheme(best_scheme)
best_scheme = self.clean_scheme(self.results.best_scheme)
best_result = self.analyse_scheme(best_scheme)

# scores after cleaning can be worse, so we reset these trackers...
self.results.best_result = best_result
self.results.best_score = best_result.score
self.results.best_scheme = best_scheme
@@ -500,7 +511,6 @@ def do_analysis(self):
the_config.reporter.write_best_scheme(self.results)



class KmeansAnalysis(Analysis):

# set the default subset size to 100 for kmeans analyses
@@ -885,6 +895,8 @@ def choose_method(search):
method = StrictClusteringAnalysis
elif search == 'rcluster':
method = RelaxedClusteringAnalysis
elif search == 'rclusterf':
method = RelaxedClusteringAnalysis
elif search == 'kmeans':
method = KmeansAnalysis
else:
2 changes: 1 addition & 1 deletion partfinder/config.py
@@ -37,7 +37,7 @@ class Configuration(object):
options = {
'branchlengths': ['linked', 'unlinked'],
'model_selection': ['aic', 'aicc', 'bic'],
'search': ['all', 'user', 'greedy', 'hcluster', 'rcluster', 'kmeans']
'search': ['all', 'user', 'greedy', 'hcluster', 'rcluster', 'rclusterf', 'kmeans']
}

def __init__(self):
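For users, the new value above is what goes on the search line of partition_finder.cfg. A minimal sketch of the validation this options table implies (hypothetical helper; the project's actual parsing code is not part of this diff):

options = {
    'search': ['all', 'user', 'greedy', 'hcluster', 'rcluster', 'rclusterf', 'kmeans'],
}

def validate_option(name, value):
    """Reject any setting not whitelisted in the options table."""
    if value not in options[name]:
        raise ValueError("'%s' is not a valid %s (choose from: %s)"
                         % (value, name, ", ".join(options[name])))
    return value

validate_option('search', 'rclusterf')  # accepted after this commit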
8 changes: 7 additions & 1 deletion partfinder/reporter.py
@@ -293,7 +293,7 @@ def write_best_scheme(self, result):
output.write(scheme_header_template % ("model_selection",
self.cfg.model_selection))
output.write(scheme_header_template % ("search", self.cfg.search))
if self.cfg.search in ["rcluster", "hcluster"]:
if self.cfg.search in ["rcluster", "hcluster", "rclusterf"]:
pretty_weights = "rate = %s, base = %s, model = %s, alpha = %s" %(
str(self.cfg.cluster_weights["rate"]),
str(self.cfg.cluster_weights["freqs"]),
@@ -356,6 +356,8 @@ def write_citation_text(self):
"partitioning schemes for phylogenomic datasets. "
"BMC evolutionary biology, 14(1), 82.")

ref_rclusterf = ref_rcluster

ref_kmeans = ("Frandsen, P. B., Calcott, B., Mayer, C., & Lanfear, R. "
"(2015). Automatic selection of partitioning schemes for "
"phylogenetic analyses using iterative k-means clustering "
@@ -399,6 +401,10 @@ def write_citation_text(self):
elif self.cfg.search == "greedy":
citation_text.append("%s\n" % ref_PF1)

elif self.cfg.search == "rclusterf":
citation_text.append("%s\n" % ref_rclusterf)


citation_text.append("\n")
if self.cfg.phylogeny_program == 'phyml':
citation_text.append("Your analysis also used PhyML, so please cite:\n")
