
Merge pull request #80 from brettc/feature/fastercluster
Feature/fastercluster

We now have the search option rclusterf, a faster version of the rcluster algorithm. I do not yet know exactly how well its results compare to rcluster's, but it should be quite a bit faster in certain situations, especially where the number of models is much smaller than the number of processors you have.
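To see where the speed-up comes from: rcluster pays for one round of candidate scoring per merge, whereas rclusterf applies every improving merge it finds before the next round. A back-of-envelope sketch (my own illustration, not code from this commit; rounds_needed is a hypothetical helper):

def rounds_needed(n_subsets, merges_per_round):
    """Rounds of candidate scoring needed to shrink n_subsets down to 1."""
    rounds, remaining = 0, n_subsets
    while remaining > 1:
        remaining -= min(merges_per_round, remaining - 1)
        rounds += 1
    return rounds

# rcluster applies a single merge per round; rclusterf applies all improving ones.
print(rounds_needed(100, 1))   # 99 rounds
print(rounds_needed(100, 10))  # 10 rounds, if ~10 merges improve the score each round

Each round scores its candidate subset pairs in parallel, so when there is less work per round than there are processors, cutting the number of rounds is the main wall-clock win.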
roblanf committed Nov 13, 2015
2 parents 97fd303 + e394e8e commit b6fcd69
Showing 3 changed files with 45 additions and 27 deletions.
62 changes: 37 additions & 25 deletions partfinder/analysis_method.py
@@ -184,7 +184,7 @@ def do_analysis(self):
c_matrix = spatial.distance.squareform(c_matrix)

# 1. pick top N subset pairs from distance matrix
cutoff = max_schemes
cutoff = max_schemes # this defines the greedy algorithm: we look at all schemes

closest_pairs = neighbour.get_N_closest_subsets(
subsets, the_config, cutoff, d_matrix)
@@ -283,12 +283,13 @@ def do_analysis(self):

class RelaxedClusteringAnalysis(Analysis):
'''
A relaxed clustering algorithm for heuristic partitioning searches
A fast relaxed clustering algorithm for heuristic partitioning searches
1. Rank subsets by their similarity (defined by clustering-weights)
2. Analyse min(cluster-percent or cluster-max) most similar schemes
3. Take the scheme that improves the AIC/BIC score the most
4. Quit if no improvements.
3. Sequentially perform all groupings that improve the AICc/BIC score, in order of improvement
4. Analyse resulting scheme, iterate to 2.
5. Quit if no improvements.
'''
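In toy form, step 3 looks something like the sketch below (my illustration, not PartitionFinder code: negative deltas mean an improved score, and where the real code rebuilds the c-matrix with neighbour.reset_c_matrix after each merge, the toy simply skips pairs whose subsets were already consumed):

def apply_improving_merges(subsets, deltas):
    """subsets: set of subset names; deltas: {(a, b): estimated score change}."""
    merged = []
    for (a, b), change in sorted(deltas.items(), key=lambda kv: kv[1]):
        if change >= 0:
            break                       # remaining candidates cannot improve the score
        if a not in subsets or b not in subsets:
            continue                    # a side was consumed by an earlier, better merge
        subsets -= {a, b}
        subsets.add(a + b)              # stand-in for subset_ops.merge_subsets
        merged.append((a, b))
    return merged

print(apply_improving_merges({'s1', 's2', 's3'},
                             {('s1', 's2'): -4.0, ('s2', 's3'): -1.5, ('s1', 's3'): 2.0}))
# [('s1', 's2')] -- ('s2', 's3') is skipped because s2 was already merged away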

def clean_scheme(self, start_scheme):
@@ -391,7 +392,7 @@ def do_analysis(self):
# 1. pick top N subset pairs from distance matrix
cutoff = int(math.ceil(max_schemes * (the_config.cluster_percent * 0.01)))
if cutoff <= 0: cutoff = 1
if the_config.cluster_max != None and cutoff>the_config.cluster_max:
if the_config.cluster_max != None and cutoff > the_config.cluster_max:
cutoff = the_config.cluster_max
log.info("Choosing the %d most similar subset pairs" % cutoff)
closest_pairs = neighbour.get_N_closest_subsets(
@@ -432,44 +433,52 @@ def do_analysis(self):
# so we need to be careful to only proceed if we have a negative change
# which indicates an improvement in the score
best_change = np.amin(c_matrix)
best_scheme = start_scheme

log.debug("Biggest improvement in info score: %s", str(best_change))

if best_change>=0:
log.info("Found no schemes that improve the score, stopping")
break

best_pair = neighbour.get_best_pair(c_matrix, best_change, subsets)
while best_change<0:

best_pair = neighbour.get_best_pair(c_matrix, best_change, subsets)
best_merged = subset_ops.merge_subsets(best_pair)
best_scheme = neighbour.make_clustered_scheme(
start_scheme, scheme_name, best_pair, best_merged, the_config)
start_scheme = best_scheme

log.info("Combining subsets: '%s' and '%s'" %(best_pair[0].name, best_pair[1].name))
log.info("This improves the %s score by: %s", the_config.model_selection, str(abs(best_change)))

# reset_c_matrix and the subset list
c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair), [best_merged], subsets)

# we update the subset list in a way that means its structure tracks the c-matrix
subsets = neighbour.reset_subsets(subsets, list(best_pair), [best_merged])

best_change = np.amin(c_matrix)

if the_config.search == 'rcluster':
break
# otherwise we are using rclusterf, which continues in this loop

best_merged = subset_ops.merge_subsets(best_pair)
best_scheme = neighbour.make_clustered_scheme(
start_scheme, scheme_name, best_pair, best_merged, the_config)
best_result = self.analyse_scheme(best_scheme)

# the best change can get updated a fraction at this point
# because calculating the info score on the whole alignment
# is a little different from doing it on the one subset
best_result = self.analyse_scheme(best_scheme)
best_change = self.results.best_score - start_score


log.info("Best scheme combines subsets: '%s' and '%s'" %(best_pair[0].name, best_pair[1].name))


log.info("The best scheme improves the %s score by %.2f to %.1f",
log.info("The best scheme has %d subsets and improves the %s score by %.2f to %.1f",
len(best_scheme.subsets),
the_config.model_selection,
np.abs(best_change),
self.results.best_score)
start_scheme = best_scheme
start_score = best_result.score

log.debug("Best pair: %s", str([s.name for s in best_pair]))
log.debug("Merged into: %s", str([best_merged.name]))

# 5. reset_c_matrix and the subset list
c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair), [best_merged], subsets)

# we update the subset list in a way that means its structure tracks the c-matrix
subsets = neighbour.reset_subsets(subsets, list(best_pair), [best_merged])

if not the_config.quick:
the_config.reporter.write_scheme_summary(
@@ -487,8 +496,10 @@ def do_analysis(self):
self.results.best_score))

if the_config.min_subset_size or the_config.all_states:
best_scheme = self.clean_scheme(best_scheme)
best_scheme = self.clean_scheme(self.results.best_scheme)
best_result = self.analyse_scheme(best_scheme)

# scores after cleaning can be worse, so we reset these trackers...
self.results.best_result = best_result
self.results.best_score = best_result.score
self.results.best_scheme = best_scheme
@@ -500,7 +511,6 @@ def do_analysis(self):
the_config.reporter.write_best_scheme(self.results)



class KmeansAnalysis(Analysis):

# set the default subset size to 100 for kmeans analyses
@@ -885,6 +895,8 @@ def choose_method(search):
method = StrictClusteringAnalysis
elif search == 'rcluster':
method = RelaxedClusteringAnalysis
elif search == 'rclusterf':
method = RelaxedClusteringAnalysis
elif search == 'kmeans':
method = KmeansAnalysis
else:
2 changes: 1 addition & 1 deletion partfinder/config.py
@@ -37,7 +37,7 @@ class Configuration(object):
options = {
'branchlengths': ['linked', 'unlinked'],
'model_selection': ['aic', 'aicc', 'bic'],
'search': ['all', 'user', 'greedy', 'hcluster', 'rcluster', 'kmeans']
'search': ['all', 'user', 'greedy', 'hcluster', 'rcluster', 'rclusterf', 'kmeans']
}

def __init__(self):
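For users, the new value above is what goes on the search line of partition_finder.cfg. A minimal sketch of the validation this options table implies (hypothetical helper; the project's actual parsing code is not part of this diff):

options = {
    'search': ['all', 'user', 'greedy', 'hcluster', 'rcluster', 'rclusterf', 'kmeans'],
}

def validate_option(name, value):
    """Reject any setting not whitelisted in the options table."""
    if value not in options[name]:
        raise ValueError("'%s' is not a valid %s (choose from: %s)"
                         % (value, name, ", ".join(options[name])))
    return value

validate_option('search', 'rclusterf')  # accepted after this commit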
8 changes: 7 additions & 1 deletion partfinder/reporter.py
@@ -293,7 +293,7 @@ def write_best_scheme(self, result):
output.write(scheme_header_template % ("model_selection",
self.cfg.model_selection))
output.write(scheme_header_template % ("search", self.cfg.search))
if self.cfg.search in ["rcluster", "hcluster"]:
if self.cfg.search in ["rcluster", "hcluster", "rclusterf"]:
pretty_weights = "rate = %s, base = %s, model = %s, alpha = %s" %(
str(self.cfg.cluster_weights["rate"]),
str(self.cfg.cluster_weights["freqs"]),
@@ -356,6 +356,8 @@ def write_citation_text(self):
"partitioning schemes for phylogenomic datasets. "
"BMC evolutionary biology, 14(1), 82.")

ref_rclusterf = ref_rcluster

ref_kmeans = ("Frandsen, P. B., Calcott, B., Mayer, C., & Lanfear, R. "
"(2015). Automatic selection of partitioning schemes for "
"phylogenetic analyses using iterative k-means clustering "
@@ -399,6 +401,10 @@ def write_citation_text(self):
elif self.cfg.search == "greedy":
citation_text.append("%s\n" % ref_PF1)

elif self.cfg.search == "rclusterf":
citation_text.append("%s\n" % ref_rclusterf)


citation_text.append("\n")
if self.cfg.phylogeny_program == 'phyml':
citation_text.append("Your analysis also used PhyML, so please cite:\n")
