From 683e50f751717538ddb2ecc2092632e743bc4295 Mon Sep 17 00:00:00 2001 From: cb-Hades <81743695+cb-Hades@users.noreply.github.com> Date: Fri, 9 Feb 2024 12:07:59 +0100 Subject: [PATCH] Started reconstructing extension module - progress found in the notebooks under dev #1, #2 --- .gitignore | 2 + dev/CB_extension_remod.ipynb | 551 ++++++++++++++++++++++++++++++++ dev/CB_under_construction.ipynb | 41 +++ specimen.egg-info/SOURCES.txt | 1 - 4 files changed, 594 insertions(+), 1 deletion(-) create mode 100644 dev/CB_extension_remod.ipynb create mode 100644 dev/CB_under_construction.ipynb diff --git a/.gitignore b/.gitignore index ad233c2..00b8e0d 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ data/media.csv data/media.numbers example/ + +build/ diff --git a/dev/CB_extension_remod.ipynb b/dev/CB_extension_remod.ipynb new file mode 100644 index 0000000..e398f78 --- /dev/null +++ b/dev/CB_extension_remod.ipynb @@ -0,0 +1,551 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "import cobra\n", + "import pandas as pd\n", + "\n", + "from random import choice \n", + "from string import ascii_uppercase, digits\n", + "from typing import Literal\n", + "\n", + "from refinegems.io import load_a_table_from_database, load_model_cobra" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "mapped_genes_path = '/Users/carolinb/Documents/104 Masterthesis/klebsiella-pipeline/example/thesis/Kp_std/03_refinement/step1-extension/genes_mapped.csv'\n", + "mg = pd.read_csv(mapped_genes_path)\n", + "mapped_genes_comp = map_BiGG_reactions(mapped_genes_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "draft_model_path = '/Users/carolinb/Documents/104 Masterthesis/klebsiella-pipeline/example/thesis/Kp_std/02_generate_draft_model/Kp_std_draft.xml'\n", + "draft_model = load_model_cobra(draft_model_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### generally useful functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# @TODO: docs\n", + "def create_random_id(model:cobra.Model, entity_type='reac', prefix='') -> str:\n", + "\n", + " match type:\n", + " case 'reac':\n", + " all_ids = [_.id for _ in model.reactions]\n", + " case 'meta':\n", + " all_ids = [_.id for _ in model.metabolites]\n", + "\n", + " prefix = f'{prefix}{entity_type}'\n", + " var = ''.join(choice(ascii_uppercase + digits) for i in range(6))\n", + " label = prefix + var\n", + " j = 6\n", + " \n", + " while True:\n", + " \n", + " for i in range(36**6): # make sure it does not run endlessly\n", + " if label in all_ids:\n", + " label = prefix + ''.join(choice(ascii_uppercase + digits) for i in range(j))\n", + " else:\n", + " return label\n", + " \n", + " j = j + 1\n", + "\n", + "\n", + "# @TODO: \n", + "# more namespace options\n", + "# @TODO: docs\n", + "# @TEST\n", + "def match_id_to_namespace(model_entity:[cobra.Reaction, cobra.Metabolite], namespace:Literal['BiGG']) -> None:\n", + "\n", + " match model_entity:\n", + "\n", + " # Reaction\n", + " # --------\n", + " case cobra.Reaction():\n", + " match namespace:\n", + "\n", + " case 'BiGG':\n", + " if 'bigg.reaction' in model_entity.annotation.keys():\n", + " model_entity.id = model_entity.annotation['bigg.reaction']\n", + "\n", + " case _:\n", + " mes = f'Unknown input for namespace: {namespace}'\n", + " raise ValueError(mes)\n", + " \n", + " # Metabolite\n", + " # ----------\n", + " case cobra.Metabolite():\n", + " match namespace:\n", + "\n", + " case 'BiGG':\n", + " if 'bigg.metabolite' in model_entity.annotation.keys():\n", + " model_entity.id = model_entity.annotation['bigg.reaction'] + '_' + model_entity.compartment\n", + "\n", + " case _:\n", + " mes = f'Unknown input for namespace: {namespace}'\n", + " raise ValueError(mes)\n", + " # Error\n", + " # -----\n", + " case _:\n", + " mes = f'Unknown type for model_entity: {type(model_entity)}'\n", + " raise TypeError(mes)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### reworking functions for extension\n", + "\n", + "#### mapping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# @NOTE changed\n", + "def map_Bigg_reactions_row(row, namespace):\n", + " \"\"\"Map a single entry from the table in map_BiGG_reactions() to the BiGG reaction namespace.\n", + "\n", + " :param row: A single row of the table.\n", + " :type row: pd.Series\n", + " :param namespace: The BiGG reaction namespace table.\n", + " :type namespace: pd.DataFrame\n", + " :returns: The edited row.\n", + " :rtype: pd.Series\n", + " \"\"\"\n", + "\n", + " \"\"\"\n", + " @TODO\n", + " NOTE: only works for cases, where KEGG.reaction in row contains EXACTLY one entry\n", + " in the rare case that multiple reactions belong to one enzyme, they are omitted\n", + " in this search\n", + " \"\"\"\n", + "\n", + " # match by EC number AND KEGG id\n", + " matches = namespace.loc[namespace['EC Number'].str.contains(row['EC number']) & namespace['KEGG Reaction'].str.contains(row['KEGG.reaction'])]\n", + "\n", + " # case 1 : no matches\n", + " if matches.empty:\n", + " return row\n", + "\n", + " # case 2 : exactly one match\n", + " elif len(matches) == 1:\n", + " row['bigg_id'] = matches['id'].values[0]\n", + "\n", + " # case 3 : multiple matches\n", + " # often due to reaction being in different compartments\n", + " else:\n", + " row['bigg_id'] = ' '.join(matches['id'].values)\n", + "\n", + " return row\n", + "\n", + "\n", + "# @TEST : fitted to refinegems\n", + "# @CHECK : connections, e.g. input is now a param short \n", + "def map_BiGG_reactions(table_file):\n", + " \"\"\"Map the output of map_to_KEGG() to a BiGG namespace file (rewritten-type, see auxilliaries).\n", + "\n", + " :param table_file: The path to the saved table from running map_to_KEGG().\n", + " :type table_file: string\n", + " :returns: The table with an additional column for the mapping to BiGG reactions.\n", + " :rtype: pd.DataFrame\n", + " \"\"\"\n", + "\n", + " r_namespace = load_a_table_from_database('bigg_reactions', False)\n", + "\n", + " table = pd.read_csv(table_file)\n", + " table['bigg_id'] = pd.Series(dtype='str')\n", + "\n", + " table = table.apply(lambda row: map_Bigg_reactions_row(row,r_namespace), axis=1)\n", + "\n", + " return table\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### actual extension" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# @TODO\n", + "# @TEST\n", + "# @DOCS wrong\n", + "# UNDER CONSTRUCTION\n", + "def build_metabolite_mnx(metabolite, model, mnx_chem_prop, mnx_chem_xref, bigg_metabolites, namespace):\n", + " \"\"\"Create or retrieve (from model) a metabolite based on its MetaNetX ID.\n", + "\n", + " :param metabolite: The MetaNetX ID of the metabolite.\n", + " :type metabolite: string\n", + " :param model: The underlying genome-scale metabolic model.\n", + " :type model: cobra.model\n", + " :param mnx_chem_xref: The chem_xref table from MetaNetX\n", + " :type mnx_chem_xref: pd.DataFrame\n", + " :param mnx_chem_prop: The chem_prop table from MetaNetX\n", + " :type mnx_chem_prop: pd.DataFrame\n", + " :param bigg_metabolites: The BiGG compound namespace table.\n", + " :type bigg_metabolites: pd.DataFrame\n", + " :returns: The retrieved or newly build metabolite.\n", + " :rtype: cobra.Metabolite\n", + " \"\"\"\n", + "\n", + " metabolite_prop = mnx_chem_prop[mnx_chem_prop['ID']==metabolite]\n", + " metabolite_anno = mnx_chem_xref[mnx_chem_xref['ID']==metabolite]\n", + " model_mnx = [x.annotation['metanetx.chemical'] for x in model.metabolites if 'metanetx.chemical' in x.annotation]\n", + "\n", + " # fast check if compound already in model\n", + " # ------------------------------------------\n", + " # @TODO ..........................................\n", + " # currently no checking for compartments\n", + " # first match will be taken (most often cytosol one)\n", + " # regardless of the compartment\n", + " #.............................................\n", + " # step 1: check if MetaNetX ID in model\n", + " if metabolite in model_mnx:\n", + " matches = [x.id for x in model.metabolites if 'metanetx.chemical' in x.annotation and x.annotation['metanetx.chemical']==metabolite]\n", + "\n", + " # step 2: if yes, retrieve metabolite from model\n", + " # case 1: multiple matches found\n", + " if len(matches) > 1:\n", + " # ................\n", + " # @TODO see above\n", + " # ................\n", + " match = model.metabolites.get_by_id(matches[0])\n", + " # case 2: only one match found\n", + " else:\n", + " match = model.metabolites.get_by_id(matches[0])\n", + "\n", + " # step 3: add metabolite\n", + " return match\n", + "\n", + " # if not, create new metabolite\n", + " # -----------------------------\n", + " else:\n", + "\n", + " # step 1: create a random metabolite ID\n", + " # ...........................\n", + " # @TODO : compartment problem \n", + " # - does it have to be in the name?\n", + " # ...........................\n", + " new_metabolite = cobra.Metabolite(create_random_id(model, 'meta','SPECIMEN')) \n", + "\n", + "\n", + " # step 2: add features\n", + " # --------------------\n", + " # @TODO ..........................................\n", + " # currently no checking for compartments\n", + " # defaults to c\n", + " # makes it difficult to add exchange reactions\n", + " #.................................................\n", + " new_metabolite.formula = metabolite_prop['formula'].iloc[0]\n", + " new_metabolite.name = metabolite_prop['name'].iloc[0]\n", + " new_metabolite.charge = metabolite_prop['charge'].iloc[0]\n", + " new_metabolite.compartment = 'c'\n", + "\n", + " # step 3: add notes\n", + " # -----------------\n", + " new_metabolite.notes['added via'] = 'metanetx.chemical'\n", + "\n", + " # step 4: add annotations\n", + " # -----------------------\n", + " new_metabolite.annotation['metanetx.chemical'] = metabolite_prop['ID'].iloc[0]\n", + " new_metabolite.annotation['chebi'] = metabolite_prop['reference'].iloc[0].upper()\n", + " if not pd.isnull(metabolite_prop['InChIKey'].iloc[0]):\n", + " new_metabolite.annotation['inchikey'] = metabolite_prop['InChIKey'].iloc[0].split('=')[1]\n", + " for db in ['kegg.compound','metacyc.compound','seed.compound','bigg.metabolite']:\n", + " db_matches = metabolite_anno[metabolite_anno['source'].str.contains(db)]\n", + " if len(db_matches) == 1:\n", + " new_metabolite.annotation[db] = db_matches['source'].iloc[0].split(':',1)[1]\n", + " elif len(db_matches) > 1:\n", + " new_metabolite.annotation[db] = [m.split(':',1)[1] for m in db_matches['source'].tolist()]\n", + "\n", + " # if no BiGG was found in MetaNetX, try reverse search in BiGG\n", + " if metabolite in bigg_metabolites['MetaNetX (MNX) Chemical']:\n", + " new_metabolite.annotation['bigg.metabolite'] = bigg_metabolites[bigg_metabolites['MetaNetX (MNX) Chemical']==metabolite].iloc[0]\n", + " \n", + " # add additional information from bigg if possible \n", + " if 'bigg.metabolite' in new_metabolite.annotation.keys():\n", + " bigg_information = bigg_metabolites[bigg_metabolites['bigg_id'].str.contains('|'.join(new_metabolite.annotation['bigg.metabolite']))]\n", + " db_id_bigg = {'BioCyc':'biocyc', 'MetaNetX (MNX) Chemical':'metanetx.chemical','SEED Compound':'seed.compound','InChI Key':'inchikey'}\n", + " for db in db_id_bigg:\n", + " info = bigg_information[db].dropna().to_list()\n", + " if len(info) > 0:\n", + " info = ','.join(info)\n", + " info = [x.strip() for x in info.split(',')] # make sure all entries are a separate list object\n", + " new_metabolite.annotation[db_id_bigg[db]] = info\n", + "\n", + " # step 5: change ID according to namespace\n", + " # ----------------------------------------\n", + " match_id_to_namespace(new_metabolite,namespace)\n", + " \n", + " # step 6: re-check existence of ID in model\n", + " # -----------------------------------------\n", + " if new_metabolite.id in [_.id for _ in model.metabolites]:\n", + " return model.metabolites.get_by_id(new_metabolite.id)\n", + " \n", + " return new_metabolite\n", + "\n", + "\n", + "\n", + "\n", + "# seems fine\n", + "def add_gene(model, reaction, row, first=False):\n", + " \"\"\"Add a new gene to a genome-scale metabolic cobra model.\n", + "\n", + " :param model: The model.\n", + " :type model: cobra.Model\n", + " :param reaction: The reaction id to add the gene to.\n", + " :type reaction: string\n", + " :param row: A single row of the output table of map_BiGG_reactions().\n", + " :type row: pd.Series\n", + " :param first: Shows, if gene is the first gene to be added to the reaction.\n", + " :type first: bool, true if gene is first to be added.\n", + " :returns: The updated model.\n", + " :rtype: cobra.Model\n", + " \"\"\"\n", + "\n", + " # add gene\n", + " if first or model.reactions.get_by_id(reaction).gene_reaction_rule == '':\n", + " model.reactions.get_by_id(reaction).gene_reaction_rule = row[0]\n", + " else:\n", + " model.reactions.get_by_id(reaction).gene_reaction_rule = model.reactions.get_by_id(reaction).gene_reaction_rule + ' or ' + row[0]\n", + "\n", + " # add name\n", + " model.genes.get_by_id(row[0]).name = row[1]\n", + "\n", + " # add annotations\n", + " if not pd.isnull(row[4]):\n", + " model.genes.get_by_id(row[0]).annotation['ncbigene'] = row[4]\n", + " model.genes.get_by_id(row[0]).annotation['ncbiprotein'] = row[2].split('.')[0]\n", + " # note: annotations like sbo, kegg.genes and uniprot missing\n", + "\n", + " return model\n", + "\n", + "\n", + "# UNDER CONSTRUCTION\n", + "def add_reaction(model,row,reac_xref,reac_prop,chem_xref,chem_prop,bigg_metabolites, namespace:str='BiGG', exclude_dna=True, exclude_rna=True):\n", + "\n", + " # create reaction object\n", + " reac = cobra.Reaction(create_random_id(model,'reac','SPECIMEN'))\n", + "\n", + " # ----------------------------\n", + " # curate reaction via MetaNetX\n", + " # ----------------------------\n", + " # try kegg.reaction --> metanetx.reaction\n", + " if F'kegg.reaction:{row[\"KEGG.reaction\"]}' in list(reac_xref['source']):\n", + " \n", + " # get MetaNetX ID\n", + " met_reac_kegg = reac_xref[reac_xref['source']==F'kegg.reaction:{row[\"KEGG.reaction\"]}']\n", + " met_reac = reac_prop[reac_prop['ID']==met_reac_kegg['ID'].iloc[0]]\n", + "\n", + " # make sure exactly one entry is parsed\n", + " # @TODO : parallel parsing\n", + " if len(met_reac) > 1:\n", + " print(F'Warning: multiple matches for kegg.reaction {row[\"KEGG.reaction\"]} found. Only first one will be used.')\n", + " met_reac = met_reac.head(1)\n", + "\n", + " # add name\n", + " # --------\n", + " # from MetaNetX KEGG description\n", + " reac.name = met_reac_kegg['description'].iloc[0].split('|')[0]\n", + "\n", + " # add notes\n", + " # ---------\n", + " reac.notes['creation'] = 'via MetaNetX'\n", + " reac.notes['KEGG.information'] = row['KEGG.notes']\n", + "\n", + " # add metabolites\n", + " # ----------------\n", + " reac.add_metabolites(get_metabolites_mnx(model,met_reac['mnx equation'].iloc[0],chem_xref,chem_prop,bigg_metabolites))\n", + " #@TODO .............\n", + " # direction of reaction\n", + " # ---> current solution:\n", + " # use one direction only\n", + " # ..................\n", + "\n", + " # add annotations\n", + " # ---------------\n", + " reac.annotation['ec-code'] = row['EC number']\n", + " reac.annotation['kegg.reaction'] = row['KEGG.reaction']\n", + " reac.annotation['metanetx.reaction'] = met_reac_kegg['ID'].iloc[0]\n", + " met_reac_anno = reac_xref[reac_xref['ID']==met_reac_kegg['ID'].iloc[0]]\n", + " for db in ['metacyc.reaction','seed.reaction','rhea','bigg.reaction']:\n", + " db_matches = met_reac_anno[met_reac_anno['source'].str.contains(db)]\n", + " if len(db_matches) == 1:\n", + " reac.annotation[db] = db_matches['source'].iloc[0].split(':',1)[1]\n", + " elif len(db_matches) > 1:\n", + " reac.annotation[db] = [r.split(':',1)[1] for r in db_matches['source'].tolist()]\n", + " else:\n", + " continue\n", + " \n", + " # if not possible, use information from KEGG only\n", + " # ------------------------\n", + " # curate reaction via KEGG\n", + " # ------------------------\n", + " else:\n", + " \n", + " # retrieve reaction information from KEGG\n", + " reac_kegg = kegg_reaction_parser(row['KEGG.reaction'])\n", + "\n", + " # add name\n", + " # --------\n", + " # from KEGG name\n", + " reac.name = reac_kegg['name']\n", + "\n", + " # add notes\n", + " # ---------\n", + " reac.notes['creation'] = 'via KEGG'\n", + " reac.notes['KEGG.information'] = row['KEGG.notes']\n", + "\n", + " # add metabolites\n", + " # ----------------\n", + " reac.add_metabolites(get_metabolites_kegg(model,reac_kegg['equation'],chem_xref,chem_prop,bigg_metabolites))\n", + " #@TODO .............\n", + " # direction of reaction\n", + " # ---> current solution:\n", + " # use one direction only\n", + " # ..................\n", + "\n", + " # add annotations\n", + " # ---------------\n", + " reac.annotation['ec-code'] = row['EC number']\n", + " reac.annotation['kegg.reaction'] = row['KEGG.reaction']\n", + " for db, identifiers in reac_kegg['db'].items():\n", + " if len(identifiers) == 1:\n", + " reac.annotation[db] = identifiers[0]\n", + " else:\n", + " reac.annotation[db] = identifiers\n", + "\n", + "\n", + " # --------------------------------------\n", + " # re-set ID to fit namespace if possible\n", + " # --------------------------------------\n", + " match_id_to_namespace(reac)\n", + "\n", + " # ---------------------\n", + " # add reaction to model\n", + " # ---------------------\n", + " \n", + " # if the ID change results in an ID already in the model, use that reaction\n", + " if reac.id in [_.id for _ in model.reactions]:\n", + " print(f'{reac.id} already in model, not added a second time.')\n", + " else:\n", + " # check if reaction is complete\n", + " # and fullfills the requirements / parameters\n", + " if isreaction_complete(reac, exclude_dna, exclude_rna):\n", + " model.add_reactions([reac])\n", + " else:\n", + " print(F'reaction {reac.name} for gene {row[\"locus_tag\"]} could not be completely reconstructed, not added to model.')\n", + " return model\n", + "\n", + " # --------\n", + " # add gene\n", + " # --------\n", + " # check if gene is already in model\n", + " if row['locus_tag'] in model.genes:\n", + " # only need to add the gene reaction rule, if not already in\n", + " if not model.reactions.get_by_id(reac.id).gene_reaction_rule or len(model.reactions.get_by_id(reac.id).gene_reaction_rule) == 0:\n", + " model.reactions.get_by_id(reac.id).gene_reaction_rule = row['locus_tag']\n", + " else:\n", + " model.reactions.get_by_id(reac.id).gene_reaction_rule = model.reactions.get_by_id(reac.id).gene_reaction_rule + ' or ' + row['locus_tag']\n", + " else:\n", + " # add (to) gene reaction rule and curate new gene object\n", + " if not model.reactions.get_by_id(reac.id).gene_reaction_rule or len(model.reactions.get_by_id(reac.id).gene_reaction_rule) == 0:\n", + " model = add_gene(model, reac.id, row, first=True)\n", + " else:\n", + " model = add_gene(model, reac.id, row, first=False)\n", + "\n", + " return model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test Area 51" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'c'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "draft_model.metabolites[0].compartment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "sprg", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/dev/CB_under_construction.ipynb b/dev/CB_under_construction.ipynb new file mode 100644 index 0000000..8e8cb0b --- /dev/null +++ b/dev/CB_under_construction.ipynb @@ -0,0 +1,41 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from refinegems.io import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mbi", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/specimen.egg-info/SOURCES.txt b/specimen.egg-info/SOURCES.txt index db4b78b..3e5a204 100644 --- a/specimen.egg-info/SOURCES.txt +++ b/specimen.egg-info/SOURCES.txt @@ -19,7 +19,6 @@ src/specimen/__init__.py src/specimen/cmd_access.py src/specimen/workflow.py src/specimen/classes/__init__.py -src/specimen/classes/medium.py src/specimen/classes/reports.py src/specimen/core/__init__.py src/specimen/core/analysis.py