newest changes

draeger-lab · Feb 7, 2024 · 2362ed8 · 2362ed8
1 parent 14fa436
commit 2362ed8
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 192 deletions.
diff --git a/src/specimen/core/refinement/extension.py b/src/specimen/core/refinement/extension.py
@@ -26,7 +26,7 @@
 from Bio.KEGG import REST
 from Bio.KEGG import Enzyme, Compound
 
-from refinegems.io import kegg_reaction_parser
+from refinegems.io import kegg_reaction_parser, load_a_table_from_database
 
 # further required programs:
 #        - DIAMOND, tested with version 0.9.14 (works only for certain sensitivity mode)
@@ -418,18 +418,19 @@ def map_Bigg_reactions_row(row, namespace):
     return row
 
 
-def map_BiGG_reactions(table_file, file):
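+# @TEST : adapted to refinegems (the BiGG reaction namespace is now loaded via load_a_table_from_database)
+# @CHECK : call sites - the signature is now one parameter short (`file` was removed)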
+def map_BiGG_reactions(table_file):
     """Map the output of map_to_KEGG() to a BiGG namespace file (rewritten-type, see auxilliaries).
 
     :param table_file: The path to the saved table from running map_to_KEGG().
     :type  table_file: string
-    :param file:       The path to the BiGG namespace (reactions!) file.
-    :type  file:       string
     :returns:          The table with an additional column for the mapping to BiGG reactions.
     :rtype:            pd.DataFrame
     """
 
-    r_namespace = pd.read_csv(file, sep='\t').fillna('-')
+    r_namespace = load_a_table_from_database('bigg_reactions', False)
+
     table = pd.read_csv(table_file)
     table['bigg_id'] = pd.Series(dtype='str')
 
@@ -1151,7 +1152,9 @@ def add_reaction(model,row,reac_xref,reac_prop,chem_xref,chem_prop,bigg_metaboli
 
 
 # notes
-def extent_model(table, model,chem_prop_file,chem_xref_file,reac_prop_file,reac_xref_file,bigg_metabolites_file, exclude_dna=True, exclude_rna=True):
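+# @TEST : adapted to refinegems (the BiGG metabolite namespace is now loaded via load_a_table_from_database)
+# @CHECK : call sites - the signature is now one parameter short (`bigg_metabolites_file` was removed)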
+def extent_model(table, model,chem_prop_file,chem_xref_file,reac_prop_file,reac_xref_file, exclude_dna=True, exclude_rna=True):
     """Add reactions, metabolites and genes to a model based on the output of map_to_bigg().
 
     :param table:                 The table with the information to be added to the model.
@@ -1166,8 +1169,6 @@ def extent_model(table, model,chem_prop_file,chem_xref_file,reac_prop_file,reac_
     :type  reac_prop_file:        string
     :param reac_xref_file:        Path to the MetaNetX reac_xref file.
     :type  reac_xref_file:        string
-    :param bigg_metabolites_file: Path to the BiGG Metabolites namespace file (rewritten).
-    :type  bigg_metabolites_file: string
     :param exclude_dna:           Tag to include or exclude DNA reactions.
     :type  exclude_dna:           bool, default is True.
     :param exclude_rna:           Tag to include or exclude RNA reactions.
@@ -1184,7 +1185,9 @@ def extent_model(table, model,chem_prop_file,chem_xref_file,reac_prop_file,reac_
     reac_xref = pd.read_csv(reac_xref_file, sep='\t', comment='#', names=['source','ID','description'])
 
     # load bigg metabolite namespace
-    bigg_metabolites = pd.read_csv(bigg_metabolites_file, sep='\t', names=['bigg_id','universal_bigg_id','name','CHEBI','BioCyc','KEGG Compound','MetaNetX (MNX) Chemical','SEED Compound','InChI Key'])
+    bigg_metabolites = load_a_table_from_database('bigg_metabolites', False)
+    bigg_metabolites.rename(columns={'id':'bigg_id'}, inplace=True)
+    bigg_metabolites = bigg_metabolites[['bigg_id','universal_bigg_id','name','CHEBI','BioCyc','KEGG Compound','MetaNetX (MNX) Chemical','SEED Compound','InChI Key']]
 
     # add genes one by one to model
     print('\tAdding genes and if needed reactions and metabolites to model:')
@@ -1231,8 +1234,10 @@ def extent_model(table, model,chem_prop_file,chem_xref_file,reac_prop_file,reac_
     return model
 
 
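+# @TEST : adapted to refinegems (the BiGG namespace files are no longer passed in)
+# @CHECK : call sites - the signature is now two parameters short (`bigg_reac` and `bigg_meta` were removed)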
 # run the main as function
-def run(draft, gene_list, fasta, db, dir, bigg_reac, bigg_meta, mnx_chem_prop, mnx_chem_xref, mnx_reac_prop, mnx_reac_xref, ncbi_map, ncbi_dat, id='locus_tag', sensitivity='more-sensitive', coverage=95.0, pid=90.0, threads=2, exclude_dna=True, exclude_rna=True, memote=False):
+def run(draft, gene_list, fasta, db, dir, mnx_chem_prop, mnx_chem_xref, mnx_reac_prop, mnx_reac_xref, ncbi_map, ncbi_dat, id='locus_tag', sensitivity='more-sensitive', coverage=95.0, pid=90.0, threads=2, exclude_dna=True, exclude_rna=True, memote=False):
     """Create a draft model.
 
     Explaination missing ....
@@ -1248,11 +1253,6 @@ def run(draft, gene_list, fasta, db, dir, bigg_reac, bigg_meta, mnx_chem_prop, m
     :param dir: Path to the directory for the output (directories).
     :type dir: string
 
-    :param bigg_reac: Path to the BiGG reaction namespace file (rewritten version).
-    :type bigg_reac: string
-    :param bigg_meta: Path to the BiGG metabolite namespace file (rewritten version).
-    :type bigg_meta: string
-
     :param mnx_chem_prop: Path to the MetaNetX chem_prop namespace file.
     :type mnx_chem_prop: string
     :param mnx_chem_xref: Path to the MetaNetX chem_xref namespace file.
@@ -1386,7 +1386,7 @@ def run(draft, gene_list, fasta, db, dir, bigg_reac, bigg_meta, mnx_chem_prop, m
 
     # map to BiGG
     print('\tmap information to BiGG namespace via EC number AND KEGG.reaction ID')
-    genes_to_add = map_BiGG_reactions(genes_to_add, bigg_reac)
+    genes_to_add = map_BiGG_reactions(genes_to_add)
 
     end = time.time()
     print(F'\ttime: {end - start}s')
@@ -1404,7 +1404,7 @@ def run(draft, gene_list, fasta, db, dir, bigg_reac, bigg_meta, mnx_chem_prop, m
     g_before = len(draft.genes)
 
     # extent the model
-    draft = extent_model(genes_to_add,draft,mnx_chem_prop,mnx_chem_xref,mnx_reac_prop,mnx_reac_xref,bigg_meta, exclude_dna, exclude_rna)
+    draft = extent_model(genes_to_add,draft,mnx_chem_prop,mnx_chem_xref,mnx_reac_prop,mnx_reac_xref, exclude_dna, exclude_rna)
     # save it
     name = F'{draft.id}_extended'
     cobra.io.write_sbml_model(draft, F'{dir}step1-extension/{name}.xml')

diff --git a/src/specimen/util/set_up.py b/src/specimen/util/set_up.py
@@ -11,20 +11,17 @@
 import os
 from pathlib import Path
 import requests
-import sys
 from tqdm import tqdm
 import yaml
 
-from . import util
-
 ################################################################################
 # variables
 ################################################################################
 
 # config keys
 # -----------
 CONFIG_PATH_OPTIONAL = ['media', 'ncbi_map', 'ncbi_dat','biocyc','universal','pan-core']
-CONFIG_PATH_REQUIRED = ['annotated_genome','full_sequence','model','diamond', 'bigg_reac','bigg_meta',
+CONFIG_PATH_REQUIRED = ['annotated_genome','full_sequence','model','diamond',
                         'mnx_chem_prop', 'mnx_chem_xref','mnx_reac_prop','mnx_reac_xref']
 
 
@@ -37,16 +34,14 @@
 MNX_URL_DICT = {'chem_prop.tsv':MNX_CHEM_PROP_URL, 'chem_xref.tsv':MNX_CHEM_XREF_URL,
                 'reac_prop.tsv':MNX_REAC_PROP_URL, 'reac_xref.tsv':MNX_REAC_XREF_URL}
 
-BIGG_REAC = 'http://bigg.ucsd.edu/static/namespace/bigg_models_reactions.txt'
-BIGG_META = 'http://bigg.ucsd.edu/static/namespace/bigg_models_metabolites.txt'
-
 ################################################################################
 # functions
 ################################################################################
 
 # ----------------------
 # setup data (structure)
 # ----------------------
+# @TEST : removed the BiGG download part, since it is already covered by refinegems
 
 def download_mnx(dir='MetaNetX/', chunk_size=1024):
     """Download the data needed from the MetaNetX database.
@@ -68,30 +63,9 @@ def download_mnx(dir='MetaNetX/', chunk_size=1024):
                 bar.update(size)
 
 
-def download_bigg(dir='BiGG-namespace'):
-    """Download the BiGG namespace files and rewrite them into
-    the format needed for the workflow.
-
-    :param dir: Name of the directory to write the files into.
-    :type dir: string
-    """
-
-    # BiGG metabolites
-    r = requests.get(BIGG_META)
-    with open(dir+'bigg_models_metabolites.txt', 'wb') as f:
-        f.write(r.content)
-    util.write_BiGG_namespace_to_table(dir+'bigg_models_metabolites.txt', type='metabolites')
-
-    # BiGG reactions
-    r = requests.get(BIGG_REAC)
-    with open(dir+'bigg_models_reactions.txt', 'wb') as f:
-        f.write(r.content)
-    util.write_BiGG_namespace_to_table(dir+'bigg_models_reactions.txt', type='reactions')
-
-
 def build_data_directories(dir, chunk_size=2048):
     """Set up the directory structure for the data and download the files
-    from MetaNetX and BiGG.
+    from MetaNetX.
 
     :param dir: Parent folder to write the subfolder structure to.
     :type dir: string
@@ -106,7 +80,7 @@ def build_data_directories(dir, chunk_size=2048):
 
     # create the data directory structure
     print('Creating directory structure...')
-    DATA_DIRECTORIES = ['annotated_genomes', 'BiGG-namespace', 'BioCyc', 'RefSeqs',
+    DATA_DIRECTORIES = ['annotated_genomes', 'BioCyc', 'RefSeqs',
                         'medium', 'MetaNetX', 'pan-core-models', 'template-models',
                         'universal-models']
     for sub_dir in DATA_DIRECTORIES:
@@ -118,14 +92,10 @@ def build_data_directories(dir, chunk_size=2048):
             print(F'Directory {new_dir} already exists.')
 
     # download data for those directories where this is possible
-    print('Downloading BiGG-namespace...')
-    download_bigg(dir + 'BiGG-namespace/')
     print('Downloading MetaNetX...')
     download_mnx(dir + 'MetaNetX/', chunk_size=chunk_size)
 
 
-
-
 # ---------------------
 # handling config files
 # ---------------------
@@ -255,7 +225,3 @@ def validate_config(userc):
     return combined_config
 
 
-
-# ----------------
-# installing tools
-# ----------------
diff --git a/src/specimen/util/util.py b/src/specimen/util/util.py
@@ -10,7 +10,6 @@
 import re
 import sys
 import subprocess
-import time
 
 # further required programs:
 #        - DIAMOND, tested with version 0.9.14
@@ -161,139 +160,3 @@ def create_NCBIinfo_mapping(dir, out, extension='gbff'):
             # go to next
             file_counter += 1
 
-
-# rewrite the BiGG namespaces into tables
-# ---------------------------------------
-
-def separate_db_links_reaction(row):
-    """Separate the database links in the column of the same name
-    from a reaction BiGG namespace file row.
-
-    :param row: one row of the table
-    :type  row: pandas object, a pd.DataFrame row
-    :returns:   The row with new columns for certain database links (EC number, BioCyc, KEGG, MNX, SEED)
-    :rtype:     pandas object, a pd.DataFrame row
-    """
-
-    values = {'EC Number': [], 'BioCyc': [], 'KEGG Reaction': [], 'MetaNetX (MNX) Equation': [], 'SEED Reaction': []}
-    if isinstance(row['database_links'], str):
-        links = row['database_links'].split(';')
-        for link in links:
-            key, value = link.split(':',1)
-            key = key.strip()
-            value = value.rsplit('/',1)[1].strip()
-            if key in values.keys():
-                values[key].append(value)
-
-    for k in values.keys():
-        row[k] = ', '.join(values[k])
-
-    return row
-
-
-def rewrite_reactions(file, out):
-    """Rewrites or reformates a given BiGG reaction namespace TXT file to the following columns:
-    bigg_id, name, reaction_string, EC number, BioCyc, MetaNetX (MNX) Equation, SEED Reaction
-
-    :param file: Path of the input file.
-    :type  file: string
-    :param out:  Path of the output file.
-    :type  out:  string
-    """
-
-    # read in the txt
-    data = pd.read_csv(file, sep='\t')
-    # remove model list and old bigg ids
-    data.drop(columns=['model_list', 'old_bigg_ids'], axis=1, inplace=True)
-    # add new columns for the database links
-    data = data.apply(separate_db_links_reaction, axis=1)
-    data.drop(columns=['database_links'], axis=1, inplace=True)
-    # save the reformatted data
-    data.to_csv(out, sep='\t', index=False, header=True)
-
-
-def separate_db_links_metabolite(row):
-    """Separate the database links in the column of the same name
-    from a metabolite BiGG namespace file row.
-
-    :param row: one row of the table
-    :type  row: pandas object, a pd.DataFrame row
-    :returns:   The row with new columns for certain database links (CHEBI, BioCyc, KEGG, MNX, SEED, InChI)
-    :rtype:     pandas object, a pd.DataFrame row
-    """
-
-    values = {'CHEBI': [], 'BioCyc': [], 'KEGG Compound': [], 'MetaNetX (MNX) Chemical': [], 'SEED Compound': [], 'InChI Key': []}
-    if isinstance(row['database_links'], str):
-        links = row['database_links'].split(';')
-        for link in links:
-            key, value = link.split(':',1)
-            key = key.strip()
-            value = value.rsplit('/',1)[1].strip()
-            if key in values.keys():
-                values[key].append(value)
-
-    for k in values.keys():
-        row[k] = ', '.join(values[k])
-
-    return row
-
-
-def rewrite_metabolites(file, out):
-    """Rewrites or reformates a given BiGG metabolites namespace TXT file to the following columns:
-    bigg_id, name, reaction_string, EC number, BioCyc, MetaNetX (MNX) Equation, SEED Reaction
-
-    :param file: Path of the input file.
-    :type  file: string
-    :param out:  Path of the output file.
-    :type  out:  string
-    """
-
-    # read in the txt
-    data = pd.read_csv(file, sep='\t')
-    # remove model list and old bigg ids
-    data.drop(columns=['model_list', 'old_bigg_ids'], axis=1, inplace=True)
-    # add new columns for the database links
-    data = data.apply(separate_db_links_metabolite, axis=1)
-    data.drop(columns=['database_links'], axis=1, inplace=True)
-    # save the reformatted data
-    data.to_csv(out, sep='\t', index=False, header=True)
-
-
-def write_BiGG_namespace_to_table(input, out=None, type='reactions'):
-    """Rewrite a BiGG namespace into a table format.
-
-    :param input: The input BiGG namespace txt-file (its path). Can be for reactions or metabolites.
-    :type input: string
-    :param out: Path to the output file. Default is the name of the input file with the additional tag '_rewritte'.
-    :type out: string
-    :param type: Specifies if the input file is for reactions or metabolites.
-        Can either be 'reactions' (default) or 'metabolites'.
-    :type type: string
-    """
-
-    print('\nrewrite BiGG txt files\n################################################################################\n')
-
-    if out == None:
-        out = os.path.dirname(input) + '/' + os.path.splitext(os.path.basename(input))[0] + '_rewritten.tsv'
-
-    # -------------
-    # start program
-    # -------------
-
-    match type:
-
-        case 'reactions':
-            start = time.time()
-            print('\trewriting reaction namespace ...')
-            rewrite_reactions(input, out)
-            end = time.time()
-            print(F'\ttime: {end - start}s')
-
-        case 'metabolites':
-            start = time.time()
-            print('\trewriting metabolite namespace ...')
-            rewrite_metabolites(input, out)
-            end = time.time()
-            print(F'\ttime: {end - start}s')
-        case _:
-            raise ValueError(F'Unknown option for namespace type: {type}')