Merge pull request #288 from bacpop/master

Update from master
bacpop · Nov 14, 2023 · 5652532 · 5652532
2 parents bf4fcec + 6ff3bbb
commit 5652532
Show file tree

Hide file tree

Showing 18 changed files with 313 additions and 33 deletions.
diff --git a/.github/workflows/azure_ci.yml b/.github/workflows/azure_ci.yml
@@ -21,10 +21,17 @@ jobs:
       uses: actions/setup-python@v2
       with:
         python-version: ${{ matrix.python-version }}
+    - name: Get current date
+      id: date
+      run: echo "date=$(date +%Y-%m-%d)" >> "${GITHUB_OUTPUT}"
     - name: Install Conda environment from environment.yml
-      uses: mamba-org/provision-with-micromamba@main
+      uses: mamba-org/setup-micromamba@v1
       with:
-        cache-env: true
+        micromamba-version: '1.4.6-0'
+        environment-file: environment.yml
+        # persist on the same day.
+        cache-environment-key: environment-${{ steps.date.outputs.date }}
+        cache-downloads-key: downloads-${{ steps.date.outputs.date }}
     - name: Install and run_test.py
       shell: bash -l {0}
       run: |

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -18,5 +18,4 @@ python:
    install:
       - requirements: docs/requirements.txt
       - method: setuptools
-        path: docs
-   system_packages: true
+        path: docs
diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py
@@ -271,8 +271,8 @@ def assign_query(dbFuncs,
     constructDatabase = dbFuncs['constructDatabase']
     readDBParams = dbFuncs['readDBParams']
 
-    if ref_db == output:
-        sys.stderr.write("--output and --ref-db must be different to "
+    if ref_db == output and overwrite == False:
+        sys.stderr.write("--output and --db must be different to "
                          "prevent overwrite.\n")
         sys.exit(1)
 
@@ -386,8 +386,8 @@ def assign_query_hdf5(dbFuncs,
     readDBParams = dbFuncs['readDBParams']
     getSeqsInDb = dbFuncs['getSeqsInDb']
 
-    if ref_db == output:
-        sys.stderr.write("--output and --ref-db must be different to "
+    if ref_db == output and overwrite == False:
+        sys.stderr.write("--output and --db must be different to "
                          "prevent overwrite.\n")
         sys.exit(1)
     if (update_db and not distances):
@@ -509,8 +509,9 @@ def assign_query_hdf5(dbFuncs,
 
             n_vertices = len(get_vertex_list(genomeNetwork, use_gpu = gpu_graph))
             if n_vertices != len(rNames):
-                sys.stderr.write(f"There are {n_vertices}  vertices in the network but {len(rNames)} reference names supplied; " + \
+                sys.stderr.write(f"ERROR: There are {n_vertices} vertices in the network but {len(rNames)} reference names supplied; " + \
                                  "please check the '--model-dir' variable is pointing to the correct directory\n")
+                sys.exit(1)
 
         if model.type == 'lineage':
             # Assign lineages by calculating query-query information

diff --git a/PopPUNK/mandrake.py b/PopPUNK/mandrake.py
@@ -20,7 +20,7 @@
 from .utils import readPickle
 
 def generate_embedding(seqLabels, accMat, perplexity, outPrefix, overwrite, kNN = 50,
-                       maxIter = 1000000, n_threads = 1, use_gpu = False, device_id = 0):
+                       maxIter = 10000000, n_threads = 1, use_gpu = False, device_id = 0):
     """Generate t-SNE projection using accessory distances
 
     Writes a plot of t-SNE clustering of accessory distances (.dot)
@@ -127,7 +127,7 @@ def get_options():
     parser.add_argument('--output', required=True, help='Name of output file')
     parser.add_argument('--perplexity', help='Perplexity used to generate projection [default = 30]', type=int, default=30)
     parser.add_argument('--knn', help='Number of neighbours used to generate t-SNE projection [default = 50]', type=int, default=50)
-    parser.add_argument('--iter', help='Number of iterations [default = 1000000]', type=int, default=1000000)
+    parser.add_argument('--iter', help='Number of iterations [default = 10000000]', type=int, default=10000000)
     parser.add_argument('--cpus', help="Number of CPU threads", type=int, default=1)
     parser.add_argument('--use-gpu', help='Whether to use GPU libraries for t-SNE calculation', default = False, action='store_true')
     parser.add_argument('--device-id', help="Device ID of GPU to use", type=int, default=0)

diff --git a/PopPUNK/network.py b/PopPUNK/network.py
@@ -536,11 +536,16 @@ def network_to_edges(prev_G_fn, rlist, adding_qq_dists = False,
         source_ids = old_source_ids
         target_ids = old_target_ids
     else:
-        # Update IDs to new versions
-        old_id_indices = [rlist.index(x) for x in old_ids]
-        # translate to indices
-        source_ids = [old_id_indices[x] for x in old_source_ids]
-        target_ids = [old_id_indices[x] for x in old_target_ids]
+        try:
+            # Update IDs to new versions
+            old_id_indices = [rlist.index(x) for x in old_ids]
+            # translate to indices
+            source_ids = [old_id_indices[x] for x in old_source_ids]
+            target_ids = [old_id_indices[x] for x in old_target_ids]
+        except ValueError:
+            sys.stderr.write(f"Network size mismatch. Previous network nodes: {max(old_id_indices)}."
+                             f"New network nodes: {max(old_source_ids.a)}/{max(old_target_ids.a)}\n")
+            sys.exit(1)
 
     # return values
     if weights:

diff --git a/PopPUNK/plot.py b/PopPUNK/plot.py
@@ -229,7 +229,7 @@ def plot_results(X, Y, means, covariances, scale, title, out_prefix):
         # Plot an ellipse to show the Gaussian component
         angle = np.arctan(u[1] / u[0])
         angle = 180. * angle / np.pi  # convert to degrees
-        ell = mpl.patches.Ellipse(mean*scale, v[0], v[1], 180. + angle, color=color)
+        ell = mpl.patches.Ellipse(mean*scale, v[0], v[1], angle=180. + angle, color=color)
         ell.set_clip_box(splot.bbox)
         ell.set_alpha(0.5)
         splot.add_artist(ell)
@@ -746,7 +746,7 @@ def writeClusterCsv(outfile, nodeNames, nodeLabels, clustering,
             prev_col_items = this_col_items
         sys.exit(1)
 
-def outputsForMicroreact(combined_list, clustering, nj_tree, mst_tree, accMat, perplexity,
+def outputsForMicroreact(combined_list, clustering, nj_tree, mst_tree, accMat, perplexity, maxIter,
                          outPrefix, epiCsv, queryList = None, overwrite = False, n_threads = 1,
                          use_gpu = False, device_id = 0):
     """Generate files for microreact
@@ -768,7 +768,9 @@ def outputsForMicroreact(combined_list, clustering, nj_tree, mst_tree, accMat, p
         accMat (numpy.array)
             n x n array of accessory distances for n samples.
         perplexity (int)
-            Perplexity parameter passed to t-SNE
+            Perplexity parameter passed to mandrake
+        maxIter (int)
+            Maximum iterations for mandrake
         outPrefix (str)
             Prefix for all generated output files, which will be placed in `outPrefix` subdirectory
         epiCsv (str)
@@ -803,7 +805,7 @@ def outputsForMicroreact(combined_list, clustering, nj_tree, mst_tree, accMat, p
 
     # write the phylogeny .nwk; t-SNE network .dot; clusters + data .csv
     embedding_file = generate_embedding(seqLabels, accMat, perplexity, outPrefix, overwrite,
-                       kNN=100, maxIter=1000000, n_threads=n_threads,
+                       kNN=100, maxIter=maxIter, n_threads=n_threads,
                        use_gpu=use_gpu, device_id=device_id)
     outfiles.append(embedding_file)
 

diff --git a/PopPUNK/utils.py b/PopPUNK/utils.py
@@ -329,14 +329,23 @@ def joinClusterDicts(d1, d2):
         d1 (dict of dicts)
             d1 with d2 appended
     """
-    if d1.keys() != d2.keys():
-        sys.stderr.write("Cluster columns not compatible\n")
+    matching_cols = set(d1.keys()).intersection(d2.keys())
+    if len(matching_cols) == 0:
+        sys.stderr.write("Cluster columns do not match between sets being combined\n")
+        sys.stderr.write(f"{d1.keys()} {d2.keys()}\n")
         sys.exit(1)
 
+    missing_cols = []
     for column in d1.keys():
-        # Combine dicts: https://stackoverflow.com/a/15936211
-        d1[column] = \
-            dict(chain.from_iterable(d.items() for d in (d1[column], d2[column])))
+        if column in matching_cols:
+            # Combine dicts: https://stackoverflow.com/a/15936211
+            d1[column] = \
+                dict(chain.from_iterable(d.items() for d in (d1[column], d2[column])))
+        else:
+            missing_cols.append(column)
+
+    for missing in missing_cols:
+        del d1[missing]
 
     return d1
 

diff --git a/PopPUNK/visualise.py b/PopPUNK/visualise.py
@@ -126,6 +126,9 @@ def get_options():
     faGroup.add_argument('--perplexity',
                          type=float, default = 20.0,
                          help='Perplexity used to calculate mandrake projection (with --microreact) [default=20.0]')
+    faGroup.add_argument('--maxIter',
+                         type=int, default = 10000000,
+                         help='Iterations used to calculate mandrake projection (with --microreact) [default=10000000]')
     faGroup.add_argument('--info-csv',
                          help='Epidemiological information CSV formatted for microreact (can be used with other outputs)')
 
@@ -170,6 +173,7 @@ def generate_visualisations(query_db,
                             grapetree,
                             cytoscape,
                             perplexity,
+                            maxIter,
                             strand_preserved,
                             include_files,
                             model_dir,
@@ -554,6 +558,7 @@ def generate_visualisations(query_db,
                                                 mst_tree,
                                                 acc_distMat,
                                                 perplexity,
+                                                maxIter,
                                                 output,
                                                 info_csv,
                                                 queryList=qlist,
@@ -638,6 +643,7 @@ def main():
                             args.grapetree,
                             args.cytoscape,
                             args.perplexity,
+                            args.maxIter,
                             args.strand_preserved,
                             args.include_files,
                             args.model_dir,

diff --git a/docs/images/ecoli_refined_fit.png b/docs/images/ecoli_refined_fit.png
diff --git a/docs/images/ecoli_refined_fit_with_neg.png b/docs/images/ecoli_refined_fit_with_neg.png
diff --git a/docs/images/ipp_tree_example.png b/docs/images/ipp_tree_example.png
diff --git a/docs/images/vp_ipp_result.png b/docs/images/vp_ipp_result.png
diff --git a/docs/index.rst b/docs/index.rst
@@ -45,6 +45,7 @@ in the `paper <https://doi.org/10.1101/gr.241455.118>`_.
    gpu.rst
    troubleshooting.rst
    scripts.rst
+   poppunk_iterate.rst
    citing.rst
    api.rst
    miscellaneous.rst

diff --git a/docs/installation.rst b/docs/installation.rst
@@ -58,12 +58,14 @@ Clone the code
 --------------
 You can also clone the github to run the latest version, which is executed by::
 
-    git clone https://github.com/johnlees/PopPUNK.git && cd PopPUNK
+    git clone https://github.com/bacpop/PopPUNK.git && cd PopPUNK
+    python3 setup.py build
+    python3 setup.py install
     python3 poppunk-runner.py
 
 This will also give access to the :ref:`scripts`.
 
 You will need to install the dependencies yourself (you can still use
-conda or pip for this purpose). See ``environment.yml``.
-
-
+conda or pip for this purpose). See ``environment.yml``::
+
+    mamba env create -f environment.yml
+    conda activate pp_env
diff --git a/docs/model_fitting.rst b/docs/model_fitting.rst
@@ -706,7 +706,7 @@ between the origin and the refined boundary position.
 Trivial cluster sets, where every sample is in its own cluster, will be excluded, so
 the final number of clusters may be less than ``<n>``.
 
-For a use of these cluster sets, see the :ref:`poppunk-iterate` section.
+For a use of these cluster sets, see the :doc:`poppunk_iterate` section.
 
 threshold
 ---------