From 679e81271a87bf9ee48763c2a1ec587da0ac65ef Mon Sep 17 00:00:00 2001 From: cb-Hades <81743695+cb-Hades@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:18:31 +0200 Subject: [PATCH] Update #7 --- docs/source/conf.py | 2 +- docs/source/modules/cmd.rst | 36 ++++++---- pyproject.toml | 2 +- src/specimen/classes/reports.py | 16 +++-- src/specimen/cmd_access.py | 68 ++++++++++--------- .../hqtb/core/refinement/smoothing.py | 2 +- src/specimen/hqtb/workflow.py | 17 +++-- src/specimen/util/util.py | 10 ++- 8 files changed, 90 insertions(+), 63 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index a50a76e..5065305 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -9,7 +9,7 @@ project = 'SPECIMEN' copyright = '2024, Carolin Brune and Gwendolyn O. Döbel' author = 'Carolin Brune and Gwendolyn O. Döbel' -release = '0.0.dev0' +release = '0.0.dev1' # -- Path setup -------------------------------------------------------------- diff --git a/docs/source/modules/cmd.rst b/docs/source/modules/cmd.rst index d018887..3db5863 100644 --- a/docs/source/modules/cmd.rst +++ b/docs/source/modules/cmd.rst @@ -1,6 +1,12 @@ The specimen.cmd_access submodule ================================= +.. warning:: + + The ``HQTB`` workflow is under heavy construction due to + changes in ``refineGEMs``. It might not work as expected + (or throw errors). Please await the next update. + .. automodule:: specimen.cmd_access :members: :undoc-members: @@ -15,8 +21,8 @@ from inside the Python environment it was installed in using: The following commands are available: -- ``cmpb`` : Pipeline for GEM curation based on CarveMe model and ModelPolisher. -- ``hqtb`` : Pipeline for GEM curation based on a high-quality template. +- ``cmpb`` : Workflow for GEM curation based on CarveMe model and ModelPolisher. +- ``hqtb`` : Workflow for GEM curation based on a high-quality template. - ``setup`` : Setup structure, data and more. 
@@ -32,22 +38,22 @@ specimen setup specimen setup config -Download a configuration file, either for the pipeline or for media. +Download a configuration file, either for the workflow or for media. Options: - ``--filename/-f``: Name/Path to save the config under. -- ``--type/-t``: Type of config to download. Can be media or basic/advanced for the pipeline config. +- ``--type/-t``: Type of config to download. Can be media or basic/advanced for the workflow config. .. code:: bash - specimen data structure [PIPELINE] + specimen data structure [WORKFLOW] -Setup a directory with the basic structure for the data needed for the pipeline. +Setup a directory with the basic structure for the data needed for the workflow. Argument: -- ``PIPELINE``: The name of the pipeline to setup the structure for. +- ``WORKFLOW``: The name of the workflow to setup the structure for. Options: @@ -59,15 +65,15 @@ specimen hqtb .. code:: bash - specimen hqtb run_pipeline [CONFIG] + specimen hqtb run [CONFIG] -Run the complete pipeline with a configuration file as input. +Run the complete workflow with a configuration file as input. .. code:: bash - specimen hqtb run_wrapper [CONFIG] + specimen hqtb wrapper [CONFIG] -Run the pipeline using a config on a directory containing multiple input genomes. +Run the workflow using a config on a directory containing multiple input genomes. Options: @@ -77,7 +83,7 @@ Options: specimen hqtb bdb [TEMPLATE] [INPUT] -Run step 1: bidirectional BLAST of the pipeline. Requires the input and template genome as input. +Run step 1: bidirectional BLAST of the workflow. Requires the input and template genome as input. Options: @@ -93,7 +99,7 @@ Options: specimen hqtb draft [TEMPLATE] [BPBBH] -Run step 2: generate draft model of the pipeline. Requires the results of the bidirectional BLAST +Run step 2: generate draft model of the workflow. Requires the results of the bidirectional BLAST and the template model as input. 
Options: @@ -136,7 +142,7 @@ Options: specimen hqtb refinement ^^^^^^^^^^^^^^^^^^^^^^^^ -Run the different parts of the step 3: refinement of the pipeline. +Run the different parts of the step 3: refinement of the workflow. .. code:: bash @@ -230,4 +236,4 @@ specimen cmpb specimen cmpb run [CONFIG] -Run the complete pipeline with a configuration file as input. \ No newline at end of file +Run the complete CMPB workflow with a configuration file as input. \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 235de4e..f61ca99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ maintainers = [ description = "SPECIMEN: A collection of workflows for strain-specific metabolic modelling." readme = "README.md" -version = "0.0.dev0" +version = "0.0.dev1" requires-python = ">=3.10" license = {text = "GPL-3.0"} keywords = ['Systems Biology', 'GEM', 'Metabolic Modelling', 'Python Package', 'Pipeline'] diff --git a/src/specimen/classes/reports.py b/src/specimen/classes/reports.py index e50d329..c63a142 100644 --- a/src/specimen/classes/reports.py +++ b/src/specimen/classes/reports.py @@ -26,9 +26,8 @@ ################################################################################ class SpecimenModelInfoReport(ModelInfoReport): - """A SPECIMEN-specific report for a given model. - - Child-class of the refineGEMs class ModelInfoReport. + """A SPECIMEN-specific version of the + ModelInfoReport for a given model. Attributes: model: @@ -70,7 +69,7 @@ def format_table(self) -> pd.DataFrame: # depending on the implementation, save and make html # can be inherited or need to be overwritten - # but currently a @TODO + # @TODO def visualise(self, color_palette: str = 'YlGn') -> tuple[matplotlib.figure.Figure]: """Extend the visualisation function to include a graph for the creation type. 
@@ -120,12 +119,17 @@ def plot_origin(data, color_palette): return (fig1,fig2) + def save(self, dir: str, color_palette: str = 'YlGn') -> None: """Save the report and the Args: - - dir (str): _description_ - - color_palette (str, optional): _description_. Defaults to 'YlGn'. + - dir (str): + Path to a directory to save the output files to. + - color_palette (str, optional): + Name of a matplotlib colour palette. + Used as the input for the figures. + Defaults to 'YlGn'. """ # save the statistics report diff --git a/src/specimen/cmd_access.py b/src/specimen/cmd_access.py index 5cc16db..cc9ec18 100644 --- a/src/specimen/cmd_access.py +++ b/src/specimen/cmd_access.py @@ -10,6 +10,8 @@ import specimen import click +import specimen.hqtb + ################################################################################ # entry points ################################################################################ @@ -50,53 +52,53 @@ def config(filename,type): # setup data directory structure # ------------------------------ @setup.command() -@click.argument('pipeline', type=click.Choice(['hqtb','high-quality template based', +@click.argument('workflow', type=click.Choice(['hqtb','high-quality template based', 'cmpb', 'carveme modelpolisher based'])) @click.option('--dir','-d', type=str, default='./data/', show_default=True, help='Name/path to the directory create subdirectories in.') @click.option('--chunk-size', '-s', type=int, default=2048, show_default=True, help=' Size of the chunks of data while downloading.') -def data_structure(pipeline, dir, chunk_size): +def data_structure(workflow, dir, chunk_size): """Create a data directory and download basic databases. Creates the 'ideal' data directory structure and directly downloads the MetaNetX and BiGG data files. - PIPELINE is the type of pipeline for which the data structure should be build. + WORKFLOW is the type of workflow for which the data structure should be built. 
""" - specimen.util.set_up.build_data_directories(pipeline, dir, chunk_size) + specimen.util.set_up.build_data_directories(workflow, dir, chunk_size) ################# -# hqtb pipeline # +# hqtb workflow # ################# @cli.group() def hqtb(): - """Pipeline for GEM curation based on a high-quality template.""" + """Workflow for GEM curation based on a high-quality template.""" -# run complete pipeline from config +# run complete workflow from config # --------------------------------- @hqtb.command() @click.argument('config', type=str) -def run_pipeline(config): - """Run the complete pipeline based on a config file. +def run(config): + """Run the complete workflow based on a config file. CONFIG is the path to the configuration file to read the parameters from. """ - specimen.workflow.run_complete(config) + specimen.hqtb.workflow.run(config) -# run complete pipeline from config and folder (run multiple times) +# run complete workflow from config and folder (run multiple times) # ------------------------------------------------------------------ @hqtb.command() @click.argument('config', type=str) @click.option('-d','--directory', default='', type=str, help='Path to the (parent) directory that contains the folders if the subject input files.') -def run_wrapper(config,directory): - """Run the complete pipeline multiple times based on a config file +def wrapper(config,directory): + """Run the complete workflow multiple times based on a config file and a folder. The folder should contain subfolders with the subject files (annotated and full genome). CONFIG is the path to the configuration file to read the parameters from. 
""" - specimen.workflow.wrapper_pipeline(config, parent_dir=directory) + specimen.hqtb.workflow.wrapper(config, parent_dir=directory) # run bidirectional blast @@ -113,12 +115,12 @@ def run_wrapper(config,directory): @click.option('--sensitivity', '-s', type=click.Choice(['sensitive','more-sensitive','very-sensitive','ultra-sensitive']), default='sensitive', help='Sensitivity mode for DIAMOND blastp run. Can be sensitive, more-sensitive, very-sensitive or ultra-sensitive. Default is sensitive.') def bdb(template, input, template_name, input_name, temp_header, in_header, dir, threads, sensitivity): - """Step 1 of the pipeline: Perform bidirectional blast on a TEMPLATE and an INPUT annotated genome. + """Step 1 of the workflow: Perform bidirectional blast on a TEMPLATE and an INPUT annotated genome. TEMPLATE is an annotated genome file (path) that is used for comparison. INPUT is an annotated genome file (path) that will be compared to TEMPLATE """ - specimen.core.bidirectional_blast.run(template, input, dir,template_name, input_name, temp_header, in_header, threads, extra_info=['locus_tag', 'product', 'protein_id'], sensitivity=sensitivity) + specimen.hqtb.core.bidirectional_blast.run(template, input, dir,template_name, input_name, temp_header, in_header, threads, extra_info=['locus_tag', 'product', 'protein_id'], sensitivity=sensitivity) # generafte draft @@ -134,12 +136,12 @@ def bdb(template, input, template_name, input_name, temp_header, in_header, dir, @click.option('--namespace','--nsp',type=click.Choice(['BiGG']),default='BiGG',help='Namespace of the model.') @click.option('--memote', is_flag=True, default=False, help='Run Memote on the generated draft model.') def draft(template, bpbbh, dir, edit_names, pid, name, medium, nsp, memote): - """Step 2 of the pipeline: Generate a draft model from a blastp best hits tsv file and a template model. + """Step 2 of the workflow: Generate a draft model from a blastp best hits tsv file and a template model. 
TEMPLATE is the path (string) to the template model.\n - BPBBH is the path (string) to the BLASTp bidirectional best hits (step 1). + BPBBH is the path (string) to the BLASTp bidirectional best hits (step 1). """ - specimen.core.generate_draft_model.run(template, bpbbh, dir, edit_names, + specimen.hqtb.core.generate_draft_model.run(template, bpbbh, dir, edit_names, pid, name, medium, nsp, memote) @@ -186,7 +188,7 @@ def extension(draft, gene_list, fasta, db, dir, draft, gene_list, fasta, db, dir, mnx_chem_prop, mnx_chem_xref, mnx_reac_prop, mnx_reac_xref """ - specimen.core.refinement.extension.run(draft, gene_list, fasta, db, dir, + specimen.hqtb.core.refinement.extension.run(draft, gene_list, fasta, db, dir, mnx_chem_prop, mnx_chem_xref, mnx_reac_prop, mnx_reac_xref, ncbi_map, ncbi_dat, id, sensitivity, @@ -243,7 +245,7 @@ def cleanup(model, MODEL is the path to the model to perform the this refinement step on. Ideally in the format of this workflow or the results might differ. """ - specimen.core.refinement.cleanup.run(model, + specimen.hqtb.core.refinement.cleanup.run(model, dir, biocyc_db, check_dupl_reac, @@ -277,7 +279,7 @@ def annotation(model,dir,kegg_via_ec,kegg_via_rc,memote): MODEL is the path to the model to be annotated. """ - specimen.core.refinement.annotation.run(model, + specimen.hqtb.core.refinement.annotation.run(model, dir, kegg_viaEC=kegg_via_ec, kegg_viaRC=kegg_via_rc, @@ -308,7 +310,7 @@ def smoothing(model, genome, dir, mcc, dna_weight_frac, ion_weight_frac, egc, na MODEL is the path to the model that is to b refined.\n Further required is a genome FASTA file of the genome the model was build on. 
""" - specimen.core.refinement.smoothing.run(genome, model, dir, mcc, + specimen.hqtb.core.refinement.smoothing.run(genome, model, dir, mcc, egc, namespace, dna_weight_frac, ion_weight_frac, @@ -322,14 +324,14 @@ def smoothing(model, genome, dir, mcc, dna_weight_frac, ion_weight_frac, egc, na @click.option('--dir', '-d', default='./validation/', type=str, help='Path to a directory for the output.') @click.option('--run-test', '-t', multiple=True, default=['all'], help='define, which tests should be run. Current possibilities are "all" and "cobra"') def validation(model,dir,run_test): - """Step 4 of the pipeline: Validate the model. + """Step 4 of the workflow: Validate the model. MODEL is the path to the model to be validated. """ if 'all' in run_test: - specimen.core.validation.run(dir, model, tests=None, all=True) + specimen.hqtb.core.validation.run(dir, model, tests=None, all=True) else: - specimen.core.validation.run(dir, model, tests=run_test, all=False) + specimen.hqtb.core.validation.run(dir, model, tests=run_test, all=False) @@ -355,14 +357,14 @@ def analysis(model, mp, test_aa_auxotrophies, pathway): - """Step 5 of the pipeline: Analyse the final model. + """Step 5 of the workflow: Analyse the final model. Includes a statistical analysis and optional a pan-core as well as a growth analysis. MODEL is the path to the model to be analysed. 
""" - specimen.core.analysis.run(model_path=model, + specimen.hqtb.core.analysis.run(model_path=model, dir=dir, media_path=mp, namespace=n, @@ -373,19 +375,21 @@ def analysis(model, ################# -# cmpb pipeline # +# cmpb workflow # ################# @cli.group() def cmpb(): - """Pipeline for GEM curation based on CarveMe model and ModelPolisher.""" + """Workflow for GEM curation based (mainly) + on CarveMe and ModelPolisher.""" +# @TODO allow command line input as well @cmpb.command() @click.argument('config',type=click.Path(exists=True)) def run(config): - """Run the pipeline for GEM curation based on a CarveMe model using a config file. + """Run the workflow for GEM curation based on a CarveMe model using a config file. CONFIG is the path to the config file. """ - specimen.cmpb.run(config) + specimen.cmpb.workflow.run(config) diff --git a/src/specimen/hqtb/core/refinement/smoothing.py b/src/specimen/hqtb/core/refinement/smoothing.py index 2a10d26..d1e893d 100644 --- a/src/specimen/hqtb/core/refinement/smoothing.py +++ b/src/specimen/hqtb/core/refinement/smoothing.py @@ -128,7 +128,7 @@ def run(genome:str,model:str,dir:str,mcc='skip', case 'greedy': print('GreedyEGCSolver:') solver = egcs.GreedyEGCSolver() - results = solver.solve_egcs(model,namespace=namespace) # @NOTE automatically uses c,p as compartments - maybe change later + results = solver.solve_egcs(model,namespace=namespace) # @NOTE automatically uses c,p as compartments if results: for k,v in results.items(): print(f'\t{k}: {v}') diff --git a/src/specimen/hqtb/workflow.py b/src/specimen/hqtb/workflow.py index 84958ec..d1b28ca 100644 --- a/src/specimen/hqtb/workflow.py +++ b/src/specimen/hqtb/workflow.py @@ -1,4 +1,9 @@ -"""Funktions to run the workflow to create a GEM based on a high-quality template model. +"""Functions to run the workflow to create a GEM based on a high-quality template model. + +.. 
warning:: + + This module is under heavy construction due to added content and + changes in refineGEMs. """ __author__ = 'Carolin Brune' @@ -26,10 +31,10 @@ # functions ################################################################################ -# @TODO +# @TODO rewrite gap filling to fit refineGEMs +# @TODO add more annotation/cleanup stuff from refineGEMs # @TEST -# -> improvements regarding new SPECIMEN update and OS independency -def run_complete(config_file:str = 'test_config.yaml'): +def run(config_file:str = 'test_config.yaml'): """Run the complete workflow for creating a strain-specific model. Args: @@ -182,7 +187,7 @@ def run_complete(config_file:str = 'test_config.yaml'): pathway=config['parameters']['analysis']['pathway']) -def wrapper_pipeline(config_file:str, parent_dir:str=""): +def wrapper(config_file:str, parent_dir:str=""): """Run the pipeline multiple times on a folder containing subfolders with subject annotated genomes and full genome sequences using the same configuration. @@ -236,4 +241,4 @@ def wrapper_pipeline(config_file:str, parent_dir:str=""): yaml.dump(current_config, config_stream) current_config = util.set_up.validate_config(temp_config.name) - run_complete(temp_config) + run(temp_config) diff --git a/src/specimen/util/util.py b/src/specimen/util/util.py index f31b1fe..fd4808e 100644 --- a/src/specimen/util/util.py +++ b/src/specimen/util/util.py @@ -1,3 +1,9 @@ +#!/usr/bin/env python +"""Utility functions. 
+""" + +__author__ = 'Carolin Brune' + ################################################################################ # requirements ################################################################################ @@ -15,7 +21,7 @@ from typing import Literal # further required programs: -# - DIAMOND, tested with version 0.9.14 +# - DIAMOND, tested with version 0.9.14+ ################################################################################ # variables @@ -95,6 +101,8 @@ def create_DIAMOND_db_from_folder(dir:str, out:str, name:str='database', # create a NCBI mapping file for the database # ------------------------------------------- +# @DISCUSSION can be merge or replaced with the one in refineGEMs? +# @DEPRECATE if the answer to the question above is yes def get_info_GenBank_Record(file_path:str) -> pd.DataFrame: """Retrieves a table containg information about the following qualifiers from a Genbank file: ['protein_id','locus_tag','db_xref','old_locus_tag','EC_number'].