Merge pull request #93 from nf-core/reports

Reports
nf-core · Dec 20, 2023 · 5092582 · 5092582
2 parents 45daa85 + 134c3f5
commit 5092582
Show file tree

Hide file tree

Showing 29 changed files with 610 additions and 143 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -36,8 +36,27 @@ jobs:
           version: "${{ matrix.NXF_VER }}"
 
       - name: Run pipeline with test data
-        # TODO nf-core: You can customise CI pipeline run tests as required
-        # For example: adding multiple test runs with different parameters
-        # Remember that you can parallelise this by using strategy.matrix
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
+
+  parameters:
+    name: Test workflow parameters
+    if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/multiplesequencealign') }}"
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        NXF_VER:
+          - "23.04.0"
+          - "latest-everything"
+    steps:
+      - name: Check out pipeline code
+        uses: actions/checkout@v4
+
+      - name: Install Nextflow
+        uses: nf-core/setup-nextflow@v1
+        with:
+          version: "${{ matrix.NXF_VER }}"
+
+      - name: Test workflow parameters
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -profile test_align_only,docker --outdir ./results
diff --git a/.gitignore b/.gitignore
@@ -10,3 +10,4 @@ null/
 outdir/
 .ipynb_checkpoints/
 test.ipynb
+*reports.tsv
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,7 +17,7 @@ Initial release of nf-core/multiplesequencealign, created with the [nf-core](htt
 [#35](https://github.com/nf-core/multiplesequencealign/issues/35) - Add module MUSCLE5_SUPER5
 [#59](https://github.com/nf-core/multiplesequencealign/issues/59) - Add support for passing structure template in samplesheet.
 [#77](https://github.com/nf-core/multiplesequencealign/issues/77) - Add module zip
-[#77](https://github.com/nf-core/multiplesequencealign/issues/77) - Add module zip
+[#93](https://github.com/nf-core/multiplesequencealign/pull/93) - Add multiqc basic support. Add custom params validation. Add basic shiny app.
 
 ### `Fixed`
 

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -10,4 +10,137 @@ report_section_order:
   "nf-core-multiplesequencealign-summary":
     order: -1002
 
+report_header_info:
+  - Application Type: "Multiple Sequence Alignment deployment and benchmarking."
+
 export_plots: true
+
+run_modules:
+  - custom_content
+
+custom_data:
+  summary_stats:
+    id: "summary_stats"
+    section_name: "Summary Stats"
+    plot_type: "table"
+    anchor: "summary_stats"
+    namespace: "summary_stats"
+    pconfig:
+      id: "summary_stats"
+      title: "Summary statistics"
+
+custom_table_header_config:
+  summary_stats:
+    perc_sim:
+      description: "Average percentage similarity between all sequences in the input fasta file."
+      group: "Fasta"
+      max: 100
+      format: "{:,.2f}"
+      suffix: "%"
+    fasta:
+      description: "Input fasta file."
+      hidden: False
+      group: "Fasta"
+    n_sequences:
+      description: "Number of sequences in the input fasta file."
+      hidden: False
+      group: "Fasta"
+      scale: "Set2"
+    seqlength_mean:
+      description: "Average sequence length in the input fasta file."
+      hidden: False
+      group: "Fasta"
+      format: "{:,.1f}"
+    seqlength_max:
+      description: "Maximum sequence length in the input fasta file."
+      hidden: True
+      group: "Fasta"
+      format: "{:,.1f}"
+    seqlength_median:
+      description: "Median sequence length in the input fasta file."
+      hidden: True
+      group: "Fasta"
+      format: "{:,.1f}"
+    sp:
+      description: "Sum of Pairs metric."
+      hidden: False
+      group: "Alignment"
+      format: "{:,.1f}"
+      max: 100
+    tc:
+      description: "Total Column metric."
+      hidden: False
+      group: "Alignment"
+      format: "{:,.1f}"
+      max: 100
+    EVALUATED:
+      description: "EVALUATED metric."
+      hidden: True
+      group: "Alignment"
+      format: "{:,.1f}"
+      max: 100
+    APDB:
+      description: "APDB metric."
+      hidden: True
+      group: "Alignment"
+      format: "{:,.1f}"
+    iRMSD:
+      description: "iRMSD metric."
+      hidden: True
+      group: "Alignment"
+      format: "{:,.1f}"
+    NiRMSD:
+      description: "NiRMSD metric."
+      hidden: False
+      group: "Alignment"
+      format: "{:,.1f}"
+      min: 0
+    tree:
+      description: "Tree used in the alignment."
+      hidden: False
+      group: "Alignment"
+      scale: "Paired"
+    args_tree:
+      description: "Arguments used to build the tree."
+      hidden: True
+      group: "Alignment"
+    args_tree_clean:
+      description: "Arguments used to build the tree."
+      hidden: True
+      group: "Alignment"
+    aligner:
+      description: "Aligner used."
+      hidden: False
+      group: "Alignment"
+      scale: "Paired"
+    args_aligner:
+      description: "Arguments used to run the aligner."
+      hidden: True
+      group: "Alignment"
+    args_aligner_clean:
+      description: "Arguments used to run the aligner."
+      hidden: True
+      group: "Alignment"
+
+table_columns_placement:
+  summary_stats:
+    fasta: 90
+    tree: 150
+    args_tree: 170
+    aligner: 200
+    args_aligner: 220
+    n_sequences: 250
+    seqlength_mean: 280
+    seqlength_median: 310
+    seqlength_max: 340
+    perc_sim: 370
+    sp: 400
+    tc: 430
+    EVALUATED: 470
+    APDB: 500
+    iRMSD: 530
+    NiRMSD: 560
+
+sp:
+  summary_stats:
+    fn: "summary_stats_eval_multiqc_table.csv"
diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
@@ -1,2 +1,3 @@
 id,fasta,reference,structures,template
 seatoxin-ref,test-dataset/setoxin-ref.fa,test-dataset/setoxin.ref,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/structures/seatoxin-ref.tar.gz,
+toxin-ref,test-dataset/toxin-ref.fa,test-dataset/toxin.ref,,
diff --git a/assets/samplesheet_copy.csv b/assets/samplesheet_copy.csv
diff --git a/assets/toolsheet.csv b/assets/toolsheet.csv
@@ -1,4 +1,10 @@
 tree,args_tree,aligner,args_aligner
-FAMSA,"-gt upgma -parttree",FAMSA,""
-"","",MAFFT,"--anysymbol --quiet --dpparttree"
-
+FAMSA,-gt upgma -parttree,FAMSA,
+,,MAFFT,--anysymbol --quiet --dpparttree
+,,MUSCLE5,
+,,LEARNMSA,
+,,KALIGN,
+CLUSTALO,,CLUSTALO,
+,,TCOFFEE,
+,,3DCOFFEE,-method TMalign_pair
+,,REGRESSIVE,-reg_nseq 3
diff --git a/assets/tooslheet_example.csv b/assets/tooslheet_example.csv
diff --git a/shiny_app/app.py → bin/app.py b/shiny_app/app.py → bin/app.py
@@ -14,19 +14,19 @@
 
 # Load file
 # ----------------------------------------------------------------------------
-summary_report = "./outdir/summary_report/evaluation_summary_report.csv"
-stats_report = "./outdir/stats/stats_summary_report.csv"
+summary_report = "./shiny_data.csv"
 
-summary_df = pd.read_csv(summary_report)
-stats_df = pd.read_csv(stats_report)
+try:
+    inputfile = pd.read_csv(summary_report)
+except:
+    print("ERROR: file not found: ", summary_report)
+    sys.exit(1)
 
-cols_to_merge = ["id"]
 
-inputfile = summary_df.merge(stats_df, on=cols_to_merge, how="left")
 # ----------------------------------------------------------------------------
 
 options = {item: item for item in list(inputfile.columns)}
-options_color = {"align": "assembly", "tree": "tree"}
+options_color = {"aligner": "assembly", "tree": "tree"}
 options_eval = {
     "sp": "sum of pairs (SP)",
     "n_sequences": "# sequences",

diff --git a/bin/calc_seqstats.py b/bin/calc_seqstats.py
@@ -7,6 +7,7 @@
 fasta_file = sys.argv[2]
 outfile = sys.argv[3]
 outfile_summary = sys.argv[4]
+outfile_mqc = sys.argv[5]
 
 
 def get_seq_lengths(fasta_file):
@@ -16,7 +17,10 @@ def get_seq_lengths(fasta_file):
         name, sequence = fasta.id, str(fasta.seq)
         l = len(sequence)
         name = name.replace("/", "_")
-        entry = pd.DataFrame([{"id": fam_name, "sequence": name, "sequence length": l}])
+        # entry = pd.DataFrame([{"id": fam_name, "sequence": name, "sequence length": l}])
+        entry = pd.DataFrame([{"id": fam_name, "sequence length": l}])
+        # count number of sequences per sequence length
+        # entry = entry.groupby(by=["id", "sequence length"]).size().reset_index(name="count")
         summary = pd.concat([summary, entry], ignore_index=True)
     return summary
 
@@ -29,7 +33,11 @@ def get_seq_lengths(fasta_file):
 )
 stats_df["n_sequences"] = len(summary_lengths)
 stats_df.rename(columns={"mean": "seqlength_mean", "max": "seqlength_max", "median": "seqlength_median"}, inplace=True)
+nseq_mqc = stats_df[["id", "n_sequences"]].drop_duplicates()
 
 
 summary_lengths.to_csv(outfile, sep=",", index=False)
 stats_df.to_csv(outfile_summary, sep=",", index=False)
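+# save the mqc file as tab-separated text with no column names
+# note: to_csv uses its default write mode here, so an existing file is overwritten, not appended to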
+nseq_mqc.to_csv(outfile_mqc, sep="\t", index=False, header=False)
diff --git a/bin/prep_multiqc_table.py b/bin/prep_multiqc_table.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+
+import csv
+import argparse
+import sys
+import pandas as pd
+
+
+def prep_table(input, output):
+    # Purpose: read the comma-separated table at `input`, add a 1-based row
+    # number as the leading "id" column, and write the result to `output`.
+    df = pd.read_csv(input, sep=",")
+    # rename the existing "id" column to "fasta" so the name "id" is free
+    # for the row number added below
+    df = df.rename(columns={"id": "fasta"})
+    # NOTE(review): earlier comments here mentioned turning NaN into "null" and
+    # replacing "null" with "default"; no such replacement happens in this function.
+
+    # add column with the row position as integer (default index + 1, so it starts at 1)
+    df["id"] = df.index + 1
+    # make it int
+    df["id"] = df["id"].astype(int)
+    # the new "id" column was appended last; rotate it to the first position
+    cols = df.columns.tolist()
+    cols = cols[-1:] + cols[:-1]
+    df = df[cols]
+    # write as CSV without the pandas index column
+    df.to_csv(output, index=False)
+
+
+def parse_args(argv=None):
+    """Define and immediately parse command line arguments."""
+    parser = argparse.ArgumentParser(description="--")
+    # -i: path of the input table (handed to prep_table as `input` by main).
+    # Not marked required, so it is None when omitted.
+    parser.add_argument(
+        "-i",
+    )
+
+    # -o: path of the output table (handed to prep_table as `output` by main).
+    # Not marked required, so it is None when omitted.
+    parser.add_argument(
+        "-o",
+    )
+    # with argv=None argparse falls back to the process command line
+    return parser.parse_args(argv)
+
+
+def main(argv=None):
+    # Entry point: parse -i / -o and run the table preparation on them.
+    args = parse_args(argv)
+    prep_table(args.i, args.o)
+
+
+if __name__ == "__main__":
+    # main() has no return statement, so sys.exit receives None (exit status 0)
+    sys.exit(main())