Skip to content

Commit

Permalink
newest changes
Browse files Browse the repository at this point in the history
  • Loading branch information
cb-Hades committed Feb 7, 2024
1 parent 14fa436 commit 2362ed8
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 192 deletions.
34 changes: 17 additions & 17 deletions src/specimen/core/refinement/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from Bio.KEGG import REST
from Bio.KEGG import Enzyme, Compound

from refinegems.io import kegg_reaction_parser
from refinegems.io import kegg_reaction_parser, load_a_table_from_database

# further required programs:
# - DIAMOND, tested with version 0.9.14 (works only for certain sensitivity mode)
Expand Down Expand Up @@ -418,18 +418,19 @@ def map_Bigg_reactions_row(row, namespace):
return row


def map_BiGG_reactions(table_file, file):
# @TEST : fitted to refinegems
# @CHECK : connections, e.g. input is now a param short
def map_BiGG_reactions(table_file):
"""Map the output of map_to_KEGG() to a BiGG namespace file (rewritten-type, see auxilliaries).
:param table_file: The path to the saved table from running map_to_KEGG().
:type table_file: string
:param file: The path to the BiGG namespace (reactions!) file.
:type file: string
:returns: The table with an additional column for the mapping to BiGG reactions.
:rtype: pd.DataFrame
"""

r_namespace = pd.read_csv(file, sep='\t').fillna('-')
r_namespace = load_a_table_from_database('bigg_reactions', False)

table = pd.read_csv(table_file)
table['bigg_id'] = pd.Series(dtype='str')

Expand Down Expand Up @@ -1151,7 +1152,9 @@ def add_reaction(model,row,reac_xref,reac_prop,chem_xref,chem_prop,bigg_metaboli


# notes
def extent_model(table, model,chem_prop_file,chem_xref_file,reac_prop_file,reac_xref_file,bigg_metabolites_file, exclude_dna=True, exclude_rna=True):
# @TEST : fitted to refinegems
# @CHECK : connections, e.g. input is now a param short
def extent_model(table, model,chem_prop_file,chem_xref_file,reac_prop_file,reac_xref_file, exclude_dna=True, exclude_rna=True):
"""Add reactions, metabolites and genes to a model based on the output of map_to_bigg().
:param table: The table with the information to be added to the model.
Expand All @@ -1166,8 +1169,6 @@ def extent_model(table, model,chem_prop_file,chem_xref_file,reac_prop_file,reac_
:type reac_prop_file: string
:param reac_xref_file: Path to the MetaNetX reac_xref file.
:type reac_xref_file: string
:param bigg_metabolites_file: Path to the BiGG Metabolites namespace file (rewritten).
:type bigg_metabolites_file: string
:param exclude_dna: Tag to include or exclude DNA reactions.
:type exclude_dna: bool, default is True.
:param exclude_rna: Tag to include or exclude RNA reactions.
Expand All @@ -1184,7 +1185,9 @@ def extent_model(table, model,chem_prop_file,chem_xref_file,reac_prop_file,reac_
reac_xref = pd.read_csv(reac_xref_file, sep='\t', comment='#', names=['source','ID','description'])

# load bigg metabolite namespace
bigg_metabolites = pd.read_csv(bigg_metabolites_file, sep='\t', names=['bigg_id','universal_bigg_id','name','CHEBI','BioCyc','KEGG Compound','MetaNetX (MNX) Chemical','SEED Compound','InChI Key'])
bigg_metabolites = load_a_table_from_database('bigg_metabolites', False)
bigg_metabolites.rename(columns={'id':'bigg_id'}, inplace=True)
bigg_metabolites = bigg_metabolites[['bigg_id','universal_bigg_id','name','CHEBI','BioCyc','KEGG Compound','MetaNetX (MNX) Chemical','SEED Compound','InChI Key']]

# add genes one by one to model
print('\tAdding genes and if needed reactions and metabolites to model:')
Expand Down Expand Up @@ -1231,8 +1234,10 @@ def extent_model(table, model,chem_prop_file,chem_xref_file,reac_prop_file,reac_
return model


# @TEST : fitted to refinegems
# @CHECK : connections, e.g. input is now two params short
# run the main as function
def run(draft, gene_list, fasta, db, dir, bigg_reac, bigg_meta, mnx_chem_prop, mnx_chem_xref, mnx_reac_prop, mnx_reac_xref, ncbi_map, ncbi_dat, id='locus_tag', sensitivity='more-sensitive', coverage=95.0, pid=90.0, threads=2, exclude_dna=True, exclude_rna=True, memote=False):
def run(draft, gene_list, fasta, db, dir, mnx_chem_prop, mnx_chem_xref, mnx_reac_prop, mnx_reac_xref, ncbi_map, ncbi_dat, id='locus_tag', sensitivity='more-sensitive', coverage=95.0, pid=90.0, threads=2, exclude_dna=True, exclude_rna=True, memote=False):
"""Create a draft model.
Explanation missing ....
Expand All @@ -1248,11 +1253,6 @@ def run(draft, gene_list, fasta, db, dir, bigg_reac, bigg_meta, mnx_chem_prop, m
:param dir: Path to the directory for the output (directories).
:type dir: string
:param bigg_reac: Path to the BiGG reaction namespace file (rewritten version).
:type bigg_reac: string
:param bigg_meta: Path to the BiGG metabolite namespace file (rewritten version).
:type bigg_meta: string
:param mnx_chem_prop: Path to the MetaNetX chem_prop namespace file.
:type mnx_chem_prop: string
:param mnx_chem_xref: Path to the MetaNetX chem_xref namespace file.
Expand Down Expand Up @@ -1386,7 +1386,7 @@ def run(draft, gene_list, fasta, db, dir, bigg_reac, bigg_meta, mnx_chem_prop, m

# map to BiGG
print('\tmap information to BiGG namespace via EC number AND KEGG.reaction ID')
genes_to_add = map_BiGG_reactions(genes_to_add, bigg_reac)
genes_to_add = map_BiGG_reactions(genes_to_add)

end = time.time()
print(F'\ttime: {end - start}s')
Expand All @@ -1404,7 +1404,7 @@ def run(draft, gene_list, fasta, db, dir, bigg_reac, bigg_meta, mnx_chem_prop, m
g_before = len(draft.genes)

# extend the model
draft = extent_model(genes_to_add,draft,mnx_chem_prop,mnx_chem_xref,mnx_reac_prop,mnx_reac_xref,bigg_meta, exclude_dna, exclude_rna)
draft = extent_model(genes_to_add,draft,mnx_chem_prop,mnx_chem_xref,mnx_reac_prop,mnx_reac_xref, exclude_dna, exclude_rna)
# save it
name = F'{draft.id}_extended'
cobra.io.write_sbml_model(draft, F'{dir}step1-extension/{name}.xml')
Expand Down
42 changes: 4 additions & 38 deletions src/specimen/util/set_up.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,17 @@
import os
from pathlib import Path
import requests
import sys
from tqdm import tqdm
import yaml

from . import util

################################################################################
# variables
################################################################################

# config keys
# -----------
CONFIG_PATH_OPTIONAL = ['media', 'ncbi_map', 'ncbi_dat','biocyc','universal','pan-core']
CONFIG_PATH_REQUIRED = ['annotated_genome','full_sequence','model','diamond', 'bigg_reac','bigg_meta',
CONFIG_PATH_REQUIRED = ['annotated_genome','full_sequence','model','diamond',
'mnx_chem_prop', 'mnx_chem_xref','mnx_reac_prop','mnx_reac_xref']


Expand All @@ -37,16 +34,14 @@
MNX_URL_DICT = {'chem_prop.tsv':MNX_CHEM_PROP_URL, 'chem_xref.tsv':MNX_CHEM_XREF_URL,
'reac_prop.tsv':MNX_REAC_PROP_URL, 'reac_xref.tsv':MNX_REAC_XREF_URL}

BIGG_REAC = 'http://bigg.ucsd.edu/static/namespace/bigg_models_reactions.txt'
BIGG_META = 'http://bigg.ucsd.edu/static/namespace/bigg_models_metabolites.txt'

################################################################################
# functions
################################################################################

# ----------------------
# setup data (structure)
# ----------------------
# @TEST : deleted BiGG part, since its already covered with refinegems

def download_mnx(dir='MetaNetX/', chunk_size=1024):
"""Download the data needed from the MetaNetX database.
Expand All @@ -68,30 +63,9 @@ def download_mnx(dir='MetaNetX/', chunk_size=1024):
bar.update(size)


def download_bigg(dir='BiGG-namespace'):
    """Download the BiGG namespace files and rewrite them into
    the format needed for the workflow.

    The metabolite namespace is processed first, then the reaction namespace.
    Each file is saved into ``dir`` and afterwards passed to
    ``util.write_BiGG_namespace_to_table`` to be rewritten.

    :param dir: Name of the directory to write the files into.
        May be given with or without a trailing path separator.
    :type dir: string
    :raises requests.HTTPError: If one of the downloads returns an HTTP error status.
    """

    downloads = (
        (BIGG_META, 'bigg_models_metabolites.txt', 'metabolites'),
        (BIGG_REAC, 'bigg_models_reactions.txt', 'reactions'),
    )

    for url, file_name, namespace_type in downloads:
        # join instead of '+' so that a directory name without a trailing
        # separator (like the default) still yields a path inside that directory
        target = os.path.join(dir, file_name)

        r = requests.get(url)
        # do not save an HTTP error page as if it were the namespace file
        r.raise_for_status()
        with open(target, 'wb') as f:
            f.write(r.content)

        util.write_BiGG_namespace_to_table(target, type=namespace_type)


def build_data_directories(dir, chunk_size=2048):
"""Set up the directory structure for the data and download the files
from MetaNetX and BiGG.
from MetaNetX.
:param dir: Parent folder to write the subfolder structure to.
:type dir: string
Expand All @@ -106,7 +80,7 @@ def build_data_directories(dir, chunk_size=2048):

# create the data directory structure
print('Creating directory structure...')
DATA_DIRECTORIES = ['annotated_genomes', 'BiGG-namespace', 'BioCyc', 'RefSeqs',
DATA_DIRECTORIES = ['annotated_genomes', 'BioCyc', 'RefSeqs',
'medium', 'MetaNetX', 'pan-core-models', 'template-models',
'universal-models']
for sub_dir in DATA_DIRECTORIES:
Expand All @@ -118,14 +92,10 @@ def build_data_directories(dir, chunk_size=2048):
print(F'Directory {new_dir} already exists.')

# download data for those directories where this is possible
print('Downloading BiGG-namespace...')
download_bigg(dir + 'BiGG-namespace/')
print('Downloading MetaNetX...')
download_mnx(dir + 'MetaNetX/', chunk_size=chunk_size)




# ---------------------
# handling config files
# ---------------------
Expand Down Expand Up @@ -255,7 +225,3 @@ def validate_config(userc):
return combined_config



# ----------------
# installing tools
# ----------------
137 changes: 0 additions & 137 deletions src/specimen/util/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import re
import sys
import subprocess
import time

# further required programs:
# - DIAMOND, tested with version 0.9.14
Expand Down Expand Up @@ -161,139 +160,3 @@ def create_NCBIinfo_mapping(dir, out, extension='gbff'):
# go to next
file_counter += 1


# rewrite the BiGG namespaces into tables
# ---------------------------------------

def separate_db_links_reaction(row):
    """Separate the database links in the column of the same name
    from a reaction BiGG namespace file row.

    The ``database_links`` entry is parsed as a ';'-separated list of
    ``<database name>: <link>`` pairs. For every tracked database, the part of
    each link after its last '/' is collected and the collected values are
    joined with ', ' into a new entry named after the database. Databases
    without any link get an empty string.

    Fragments that are not of the form ``<database name>: <.../identifier>``
    (e.g. the empty fragment produced by a trailing ';') are skipped.

    :param row: one row of the table
    :type row: pandas object, a pd.DataFrame row
    :returns: The row with new columns for certain database links (EC number, BioCyc, KEGG, MNX, SEED)
    :rtype: pandas object, a pd.DataFrame row
    """

    values = {'EC Number': [], 'BioCyc': [], 'KEGG Reaction': [], 'MetaNetX (MNX) Equation': [], 'SEED Reaction': []}

    links = row['database_links']
    # anything that is not a string (e.g. a missing value) carries no links
    if isinstance(links, str):
        for link in links.split(';'):
            key, colon, value = link.partition(':')
            _, slash, identifier = value.rpartition('/')
            # skip malformed fragments instead of aborting the whole rewrite
            if not colon or not slash:
                continue
            key = key.strip()
            if key in values:
                values[key].append(identifier.strip())

    for k, collected in values.items():
        row[k] = ', '.join(collected)

    return row


def rewrite_reactions(file, out):
    """Reformat a BiGG reaction namespace TXT file into a table with one column per database.

    The tab-separated input is read, the columns ``model_list`` and
    ``old_bigg_ids`` are removed and the ``database_links`` column is replaced
    by the columns created by :func:`separate_db_links_reaction`
    (EC Number, BioCyc, KEGG Reaction, MetaNetX (MNX) Equation, SEED Reaction).
    The result is written as a tab-separated file with a header and without the index.

    :param file: Path of the input file.
    :type file: string
    :param out: Path of the output file.
    :type out: string
    """

    # load the namespace file
    table = pd.read_csv(file, sep='\t')
    # the model list and the old BiGG IDs are not needed in the rewritten table
    table = table.drop(columns=['model_list', 'old_bigg_ids'])
    # spread the database links over separate columns, then discard the raw links
    table = table.apply(separate_db_links_reaction, axis=1)
    table = table.drop(columns=['database_links'])
    # write the reformatted table
    table.to_csv(out, sep='\t', index=False, header=True)


def separate_db_links_metabolite(row):
    """Separate the database links in the column of the same name
    from a metabolite BiGG namespace file row.

    The ``database_links`` entry is parsed as a ';'-separated list of
    ``<database name>: <link>`` pairs. For every tracked database, the part of
    each link after its last '/' is collected and the collected values are
    joined with ', ' into a new entry named after the database. Databases
    without any link get an empty string.

    Fragments that are not of the form ``<database name>: <.../identifier>``
    (e.g. the empty fragment produced by a trailing ';') are skipped.

    :param row: one row of the table
    :type row: pandas object, a pd.DataFrame row
    :returns: The row with new columns for certain database links (CHEBI, BioCyc, KEGG, MNX, SEED, InChI)
    :rtype: pandas object, a pd.DataFrame row
    """

    values = {'CHEBI': [], 'BioCyc': [], 'KEGG Compound': [], 'MetaNetX (MNX) Chemical': [], 'SEED Compound': [], 'InChI Key': []}

    links = row['database_links']
    # anything that is not a string (e.g. a missing value) carries no links
    if isinstance(links, str):
        for link in links.split(';'):
            key, colon, value = link.partition(':')
            _, slash, identifier = value.rpartition('/')
            # skip malformed fragments instead of aborting the whole rewrite
            if not colon or not slash:
                continue
            key = key.strip()
            if key in values:
                values[key].append(identifier.strip())

    for k, collected in values.items():
        row[k] = ', '.join(collected)

    return row


def rewrite_metabolites(file, out):
    """Reformat a BiGG metabolite namespace TXT file into a table with one column per database.

    The tab-separated input is read, the columns ``model_list`` and
    ``old_bigg_ids`` are removed and the ``database_links`` column is replaced
    by the columns created by :func:`separate_db_links_metabolite`
    (CHEBI, BioCyc, KEGG Compound, MetaNetX (MNX) Chemical, SEED Compound, InChI Key).
    The result is written as a tab-separated file with a header and without the index.

    :param file: Path of the input file.
    :type file: string
    :param out: Path of the output file.
    :type out: string
    """

    # load the namespace file
    table = pd.read_csv(file, sep='\t')
    # the model list and the old BiGG IDs are not needed in the rewritten table
    table = table.drop(columns=['model_list', 'old_bigg_ids'])
    # spread the database links over separate columns, then discard the raw links
    table = table.apply(separate_db_links_metabolite, axis=1)
    table = table.drop(columns=['database_links'])
    # write the reformatted table
    table.to_csv(out, sep='\t', index=False, header=True)


def write_BiGG_namespace_to_table(input, out=None, type='reactions'):
    """Rewrite a BiGG namespace into a table format.

    :param input: The input BiGG namespace txt-file (its path). Can be for reactions or metabolites.
    :type input: string
    :param out: Path to the output file. Default is the name of the input file
        (without its extension) with the additional suffix '_rewritten.tsv',
        placed in the same directory as the input file.
    :type out: string
    :param type: Specifies if the input file is for reactions or metabolites.
        Can either be 'reactions' (default) or 'metabolites'.
    :type type: string
    :raises ValueError: If ``type`` is neither 'reactions' nor 'metabolites'.
    """

    print('\nrewrite BiGG txt files\n################################################################################\n')

    if out is None:
        stem = os.path.splitext(os.path.basename(input))[0]
        # os.path.join keeps the output next to the input even when the input
        # has no directory part; plain concatenation with '/' would point
        # to the file system root in that case
        out = os.path.join(os.path.dirname(input), stem + '_rewritten.tsv')

    # -------------
    # start program
    # -------------

    # select the rewriting function and the label used in the progress message
    if type == 'reactions':
        label, rewrite = 'reaction', rewrite_reactions
    elif type == 'metabolites':
        label, rewrite = 'metabolite', rewrite_metabolites
    else:
        raise ValueError(F'Unknown option for namespace type: {type}')

    start = time.time()
    print(F'\trewriting {label} namespace ...')
    rewrite(input, out)
    end = time.time()
    print(F'\ttime: {end - start}s')

0 comments on commit 2362ed8

Please sign in to comment.