diff --git a/dev/CB_extension_remod.ipynb b/dev/CB_extension_remod.ipynb index 3d3be01..c2aa908 100644 --- a/dev/CB_extension_remod.ipynb +++ b/dev/CB_extension_remod.ipynb @@ -52,942 +52,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### generally useful functions\n", - "\n", - "-> maybe something for refinegems \n", - "--> where to put them, if yes" - ] - }, - { - "cell_type": "code", - "execution_count": 158, - "metadata": {}, - "outputs": [], - "source": [ - "# put in entities in refinegems\n", - "def make_reaction_annotation_dict(model:cobra.Model, db:Literal['KEGG','BiGG']) -> dict:\n", - " \"\"\"Create a dictionary of a model's reaction IDs and a choosen database ID as\n", - " saved in the annotations of the model.\n", - " The database ID can be choosen based on the strings for the namespace options\n", - " in other functions.\n", - "\n", - " Args:\n", - " model (cobra.Model): A model loaded with COBRApy.\n", - " db (Literal['KEGG','BiGG']): The string denoting the database to map to.\n", - "\n", - " Raises:\n", - " ValueError: Unknown database string for paramezer db\n", - "\n", - " Returns:\n", - " dict: The mapping of the reaction IDs to the database IDs found in the annotations\n", - " \"\"\"\n", - "\n", - " react_dict = {}\n", - "\n", - " match db:\n", - " case 'KEGG':\n", - " db_string = 'kegg.reaction'\n", - " case 'BiGG':\n", - " db_string = 'bigg.reaction'\n", - " case _:\n", - " mes = f'Unknown database string for parameter db: {db}'\n", - " raise ValueError(mes)\n", - "\n", - " for r in model.reactions:\n", - " if db_string in r.annotation.keys():\n", - " react_dict[r.id] = r.annotation[db_string]\n", - " else:\n", - " react_dict[r.id] = '-'\n", - "\n", - " return react_dict\n", - "\n", - "\n", - "\n", - "def create_random_id(model:cobra.Model, entity_type:Literal['reac','meta']='reac', prefix:str='') -> str:\n", - " \"\"\"Generate a unique, random ID for a model entity for a model.\n", - "\n", - " Args:\n", - " model (cobra.Model): A model loaded with COBRApy.\n", - " entity_type (Literal['reac','meta'], optional): Type of model entity. \n", - " Can be 'reac' for Reaction or 'meta' for Metabolite.\n", - " Defaults to 'reac'.\n", - " prefix (str, optional): prefix to set for the randomised part.\n", - " Useful to identify the random IDs later on. \n", - " Defaults to ''.\n", - "\n", - " Raises:\n", - " ValueError: Unknown entity_type\n", - "\n", - " Returns:\n", - " str: The generate new and unique ID.\n", - " \"\"\"\n", - "\n", - " match entity_type:\n", - " case 'reac':\n", - " all_ids = [_.id for _ in model.reactions]\n", - " case 'meta':\n", - " all_ids = [_.id for _ in model.metabolites]\n", - " case _:\n", - " mes = f'Unkown entity_type: {entity_type}'\n", - " raise ValueError(mes)\n", - "\n", - " prefix = f'{prefix}{entity_type}'\n", - " var = ''.join(choice(ascii_uppercase + digits) for i in range(6))\n", - " label = prefix + var\n", - " j = 6\n", - " \n", - " while True:\n", - " \n", - " for i in range(36**6): # make sure it does not run endlessly\n", - " if label in all_ids:\n", - " label = prefix + ''.join(choice(ascii_uppercase + digits) for x in range(j))\n", - " else:\n", - " return label\n", - " \n", - " j = j + 1\n", - "\n", - "\n", - "# @TODO: \n", - "# more namespace options\n", - "def match_id_to_namespace(model_entity:[cobra.Reaction, cobra.Metabolite], namespace:Literal['BiGG']) -> None:\n", - " \"\"\"Based on a given namespace, change the ID of a given model entity to it the set namespace.\n", - "\n", - " Currently working namespaces:\n", - " - BiGG \n", - "\n", - " Args:\n", - " model_entity (cobra.Reaction, cobra.Metabolite]): The model entity. \n", - " Can be either a cobra.Reaction or cobra.Metabolite object.\n", - " namespace (Literal['BiGG']): The chosen namespace.\n", - "\n", - " Raises:\n", - " ValueError: Unknown input for namespace\n", - " TypeError: Unknown type for model_entity\n", - " \"\"\"\n", - "\n", - " match model_entity:\n", - "\n", - " # Reaction\n", - " # --------\n", - " case cobra.Reaction():\n", - " match namespace:\n", - "\n", - " case 'BiGG':\n", - " if 'bigg.reaction' in model_entity.annotation.keys():\n", - " # @TODO : currently takes first entry is annotation is list\n", - " model_entity.id = model_entity.annotation['bigg.reaction'] if isinstance(model_entity.annotation['bigg.reaction'],str) else model_entity.annotation['bigg.reaction'][0]\n", - "\n", - " case _:\n", - " mes = f'Unknown input for namespace: {namespace}'\n", - " raise ValueError(mes)\n", - " \n", - " # Metabolite\n", - " # ----------\n", - " case cobra.Metabolite():\n", - " match namespace:\n", - "\n", - " case 'BiGG':\n", - " if 'bigg.metabolite' in model_entity.annotation.keys():\n", - " model_entity.id = model_entity.annotation['bigg.metabolite'] + '_' + model_entity.compartment if isinstance(model_entity.annotation['bigg.metabolite'],str) else model_entity.annotation['bigg.metabolite'][0]\n", - "\n", - " case _:\n", - " mes = f'Unknown input for namespace: {namespace}'\n", - " raise ValueError(mes)\n", - " # Error\n", - " # -----\n", - " case _:\n", - " mes = f'Unknown type for model_entity: {type(model_entity)}'\n", - " raise TypeError(mes)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### reworking functions for extension\n", - "\n", - "#### mapping" + "### reworking functions for extension\n" ] }, { "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "# @NOTE changed\n", - "def map_BiGG_reactions_row(row, namespace):\n", - " \"\"\"Map a single entry from the table in map_BiGG_reactions() to the BiGG reaction namespace.\n", - "\n", - " :param row: A single row of the table.\n", - " :type row: pd.Series\n", - " :param namespace: The BiGG reaction namespace table.\n", - " :type namespace: pd.DataFrame\n", - " :returns: The edited row.\n", - " :rtype: pd.Series\n", - " \"\"\"\n", - "\n", - " \"\"\"\n", - " @TODO\n", - " NOTE: only works for cases, where KEGG.reaction in row contains EXACTLY one entry\n", - " in the rare case that multiple reactions belong to one enzyme, they are omitted\n", - " in this search\n", - " \"\"\"\n", - "\n", - " # match by EC number AND KEGG id\n", - " matches = namespace.loc[namespace['EC Number'].str.contains(row['EC number']) & namespace['KEGG Reaction'].str.contains(row['KEGG.reaction'])]\n", - "\n", - " # case 1 : no matches\n", - " if matches.empty:\n", - " return row\n", - "\n", - " # case 2 : exactly one match\n", - " elif len(matches) == 1:\n", - " row['bigg_id'] = matches['id'].values[0]\n", - "\n", - " # case 3 : multiple matches\n", - " # often due to reaction being in different compartments\n", - " else:\n", - " row['bigg_id'] = ' '.join(matches['id'].values)\n", - "\n", - " return row\n", - "\n", - "\n", - "# @TEST : fitted to refinegems\n", - "# @CHECK : connections, e.g. input is now a param short \n", - "def map_BiGG_reactions(table_file):\n", - " \"\"\"Map the output of map_to_KEGG() to a BiGG namespace file (rewritten-type, see auxilliaries).\n", - "\n", - " :param table_file: The path to the saved table from running map_to_KEGG().\n", - " :type table_file: string\n", - " :returns: The table with an additional column for the mapping to BiGG reactions.\n", - " :rtype: pd.DataFrame\n", - " \"\"\"\n", - "\n", - " r_namespace = load_a_table_from_database('bigg_reactions', False)\n", - "\n", - " table = pd.read_csv(table_file)\n", - " table['bigg_id'] = pd.Series(dtype='str')\n", - "\n", - " table = table.apply(lambda row: map_BiGG_reactions_row(row,r_namespace), axis=1)\n", - "\n", - " return table\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### actual extension" - ] - }, - { - "cell_type": "code", - "execution_count": 73, + "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# @TODO\n", - "def isreaction_complete(reac, exclude_dna=True, exclude_rna=True):\n", - " \"\"\"Check, if a reaction is complete and ready to be added to the model.\n", - " Additionally, it is possible to check for DNA and RNA reations\n", - " and set them to be excluded or included.\n", - "\n", - " :param reac: The reaction to be checked.\n", - " :type reac: cobra.Reaction\n", - " :param exclude_dna: Tag to include or exclude DNA reactions.\n", - " :type exclude_dna: bool, default is True.\n", - " :param exclude_rna: Tag to include or exclude RNA reactions.\n", - " :type exclude_rna: bool, default is True.\n", - " :returns: True if the check is successful, else false.\n", - " :rtype: bool\n", - " \"\"\"\n", - "\n", - " # ................\n", - " # @TODO\n", - " # extendable\n", - " # ................\n", - "\n", - " # check reaction\n", - " if exclude_dna and 'DNA' in reac.name:\n", - " return False\n", - " if exclude_rna and 'RNA' in reac.name:\n", - " return False\n", - "\n", - " # check metabolites\n", - " for m in reac.metabolites:\n", - " if m.id == '' or pd.isnull(m.id):\n", - " return False\n", - " if m.name == '' or pd.isnull(m.name):\n", - " return False\n", - " if m.formula == '' or pd.isnull(m.formula):\n", - " return False\n", - "\n", - " return True\n", - "\n", - "\n", - "# @TODO\n", - "# @DOCS wrong\n", - "# UNDER CONSTRUCTION\n", - "def build_metabolite_mnx(metabolite, model, mnx_chem_prop, mnx_chem_xref, bigg_metabolites, namespace):\n", - " \"\"\"Create or retrieve (from model) a metabolite based on its MetaNetX ID.\n", - "\n", - " :param metabolite: The MetaNetX ID of the metabolite.\n", - " :type metabolite: string\n", - " :param model: The underlying genome-scale metabolic model.\n", - " :type model: cobra.model\n", - " :param mnx_chem_xref: The chem_xref table from MetaNetX\n", - " :type mnx_chem_xref: pd.DataFrame\n", - " :param mnx_chem_prop: The chem_prop table from MetaNetX\n", - " :type mnx_chem_prop: pd.DataFrame\n", - " :param bigg_metabolites: The BiGG compound namespace table.\n", - " :type bigg_metabolites: pd.DataFrame\n", - " :returns: The retrieved or newly build metabolite.\n", - " :rtype: cobra.Metabolite\n", - " \"\"\"\n", - "\n", - " metabolite_prop = mnx_chem_prop[mnx_chem_prop['ID']==metabolite]\n", - " metabolite_anno = mnx_chem_xref[mnx_chem_xref['ID']==metabolite]\n", - " model_mnx = [x.annotation['metanetx.chemical'] for x in model.metabolites if 'metanetx.chemical' in x.annotation]\n", - "\n", - " # fast check if compound already in model\n", - " # ------------------------------------------\n", - " # @TODO ..........................................\n", - " # currently no checking for compartments\n", - " # first match will be taken (most often cytosol one)\n", - " # regardless of the compartment\n", - " #.............................................\n", - " # step 1: check if MetaNetX ID in model\n", - " if metabolite in model_mnx:\n", - " matches = [x.id for x in model.metabolites if 'metanetx.chemical' in x.annotation and x.annotation['metanetx.chemical']==metabolite]\n", - "\n", - " # step 2: if yes, retrieve metabolite from model\n", - " # case 1: multiple matches found\n", - " if len(matches) > 1:\n", - " # ................\n", - " # @TODO see above\n", - " # ................\n", - " match = model.metabolites.get_by_id(matches[0])\n", - " # case 2: only one match found\n", - " else:\n", - " match = model.metabolites.get_by_id(matches[0])\n", - "\n", - " # step 3: add metabolite\n", - " return match\n", - "\n", - " # if not, create new metabolite\n", - " # -----------------------------\n", - " else:\n", - "\n", - " # step 1: create a random metabolite ID\n", - " # ...........................\n", - " # @TODO : compartment problem \n", - " # - does it have to be in the name?\n", - " # ...........................\n", - " new_metabolite = cobra.Metabolite(create_random_id(model, 'meta','SPECIMEN')) \n", - "\n", - "\n", - " # step 2: add features\n", - " # --------------------\n", - " # @TODO ..........................................\n", - " # currently no checking for compartments\n", - " # defaults to c\n", - " # makes it difficult to add exchange reactions\n", - " #.................................................\n", - " new_metabolite.formula = metabolite_prop['formula'].iloc[0]\n", - " new_metabolite.name = metabolite_prop['name'].iloc[0]\n", - " new_metabolite.charge = metabolite_prop['charge'].iloc[0]\n", - " new_metabolite.compartment = 'c'\n", - "\n", - " # step 3: add notes\n", - " # -----------------\n", - " new_metabolite.notes['added via'] = 'metanetx.chemical'\n", - "\n", - " # step 4: add annotations\n", - " # -----------------------\n", - " new_metabolite.annotation['metanetx.chemical'] = metabolite_prop['ID'].iloc[0]\n", - " new_metabolite.annotation['chebi'] = metabolite_prop['reference'].iloc[0].upper()\n", - " if not pd.isnull(metabolite_prop['InChIKey'].iloc[0]):\n", - " new_metabolite.annotation['inchikey'] = metabolite_prop['InChIKey'].iloc[0].split('=')[1]\n", - " for db in ['kegg.compound','metacyc.compound','seed.compound','bigg.metabolite']:\n", - " db_matches = metabolite_anno[metabolite_anno['source'].str.contains(db)]\n", - " if len(db_matches) == 1:\n", - " new_metabolite.annotation[db] = db_matches['source'].iloc[0].split(':',1)[1]\n", - " elif len(db_matches) > 1:\n", - " new_metabolite.annotation[db] = [m.split(':',1)[1] for m in db_matches['source'].tolist()]\n", - "\n", - " # if no BiGG was found in MetaNetX, try reverse search in BiGG\n", - " if metabolite in bigg_metabolites['MetaNetX (MNX) Chemical']:\n", - " new_metabolite.annotation['bigg.metabolite'] = bigg_metabolites[bigg_metabolites['MetaNetX (MNX) Chemical']==metabolite].iloc[0]\n", - " \n", - " # add additional information from bigg if possible \n", - " if 'bigg.metabolite' in new_metabolite.annotation.keys():\n", - " bigg_information = bigg_metabolites[bigg_metabolites['bigg_id'].str.contains('|'.join(new_metabolite.annotation['bigg.metabolite']))]\n", - " db_id_bigg = {'BioCyc':'biocyc', 'MetaNetX (MNX) Chemical':'metanetx.chemical','SEED Compound':'seed.compound','InChI Key':'inchikey'}\n", - " for db in db_id_bigg:\n", - " info = bigg_information[db].dropna().to_list()\n", - " if len(info) > 0:\n", - " info = ','.join(info)\n", - " info = [x.strip() for x in info.split(',')] # make sure all entries are a separate list object\n", - " new_metabolite.annotation[db_id_bigg[db]] = info\n", - "\n", - " # step 5: change ID according to namespace\n", - " # ----------------------------------------\n", - " match_id_to_namespace(new_metabolite,namespace)\n", - " \n", - " # step 6: re-check existence of ID in model\n", - " # -----------------------------------------\n", - " # @TODO : check complete annotations? \n", - " # - or let those be covered by the duplicate check later on?\n", - " if new_metabolite.id in [_.id for _ in model.metabolites]:\n", - " return model.metabolites.get_by_id(new_metabolite.id)\n", - " \n", - " return new_metabolite\n", - "\n", - "# @TODO\n", - "# @DOCS\n", - "# UNDER CONSTRUCTION\n", - "def build_metabolite_kegg(kegg_id, model, model_kegg_ids, bigg_metabolites, namespace):\n", - " \"\"\"Create or retrieve (from model) a metabolite based on its KEGG ID.\n", - "\n", - " :param kegg_id: The KEGG.compound ID of the metabolite in question.\n", - " :type kegg_id: string\n", - " :param model: The model.\n", - " :type model: cobra.Model\n", - " :param model_kegg_ids: List of all annotated KEGG Compound IDs in the model.\n", - " :type model_kegg_ids: list\n", - " :param bigg_metabolites: The BiGG compound namespace table.\n", - " :type bigg_metabolites: pd.DataFrame\n", - " :returns: The retrieved or newly build metabolite.\n", - " :rytpe: cobra.Metabolite\n", - " \"\"\"\n", - "\n", - " # retrieve KEGG entry for compound\n", - " # --------------------------------\n", - " try:\n", - " kegg_handle = REST.kegg_get(kegg_id)\n", - " kegg_record = [r for r in Compound.parse(kegg_handle)][0]\n", - " except urllib.error.HTTPError:\n", - " print(F'HTTPError: {kegg_id}')\n", - " return cobra.Metabolite()\n", - " except ConnectionResetError:\n", - " print(F'ConnectionResetError: {kegg_id}')\n", - " return cobra.Metabolite()\n", - " except urllib.error.URLError:\n", - " print(F'URLError: {kegg_id}')\n", - " return cobra.Metabolite()\n", - "\n", - " # ---------------------------------------\n", - " # fast check if compound already in model\n", - " # ---------------------------------------\n", - " # @TODO ..........................................\n", - " # currently no checking for compartments\n", - " # first match will be taken (most often cytosol one)\n", - " # regardless of the compartment\n", - " #.............................................\n", - " # step 1: check via KEGG ID\n", - " if kegg_id in model_kegg_ids:\n", - " matches = [x.id for x in model.metabolites if ('kegg.compound' in x.annotation and x.annotation['kegg.compound'] == kegg_id)]\n", - "\n", - " # step 2: model id --> metabolite object\n", - " # case 1: multiple matches found\n", - " if len(matches) > 1:\n", - " match = model.metabolites.get_by_id(matches[0])\n", - " # case 2: only one match found\n", - " else:\n", - " match = model.metabolites.get_by_id(matches[0])\n", - "\n", - " # step 3: add metabolite\n", - " return match\n", - "\n", - " # -----------------------------\n", - " # if not, create new metabolite\n", - " # -----------------------------\n", - " # ...............\n", - " # @TODO\n", - " # compartment\n", - " # ...............\n", - " else:\n", - " # step 1: create a random metabolite ID\n", - " # -------------------------------------\n", - " # ...........................\n", - " # @TODO : compartment problem \n", - " # - does it have to be in the name?\n", - " # ...........................\n", - " new_metabolite = cobra.Metabolite(create_random_id(model, 'meta','SPECIMEN')) \n", - "\n", - " # step 2: add features\n", - " # --------------------\n", - " # @TODO ..........................................\n", - " # currently no checking for compartments\n", - " #.............................................\n", - " # set name from KEGG and additionally use it as ID if there is none yet\n", - " if isinstance(kegg_record.name, list):\n", - " if len(kegg_record.name) > 1:\n", - " new_metabolite.name = kegg_record.name[1]\n", - " else:\n", - " new_metabolite.name = kegg_record.name[0]\n", - " else:\n", - " new_metabolite.name = kegg_record.name\n", - " # set compartment\n", - " new_metabolite.compartment = 'c'\n", - " # set formula\n", - " new_metabolite.formula = kegg_record.formula\n", - "\n", - " # step 3: add notes\n", - " # -----------------\n", - " new_metabolite.notes['added via'] = 'KEGG.compound'\n", - "\n", - " # step 4: add annotations\n", - " # -----------------------\n", - " new_metabolite.annotation['kegg.compound'] = kegg_id\n", - " db_idtf = {'CAS':'cas','PubChem':'pubchem.compound','ChEBI':'chebi'}\n", - " for db,ids in kegg_record.dblinks:\n", - " if db in db_idtf:\n", - " if len(ids) > 1:\n", - " new_metabolite.annotation[db_idtf[db]] = ids\n", - " else:\n", - " new_metabolite.annotation[db_idtf[db]] = ids[0]\n", - "\n", - " # add additional information from BiGG\n", - " if kegg_id in bigg_metabolites['KEGG Compound']:\n", - "\n", - " bigg_information = bigg_metabolites[bigg_metabolites['KEGG Compound']==kegg_id]\n", - " if len(bigg_information) > 0:\n", - "\n", - " new_metabolite.annotation['bigg.metabolite'] = bigg_information['bigg_id'].to_list()\n", - "\n", - " db_id_bigg = {'BioCyc':'biocyc', 'MetaNetX (MNX) Chemical':'metanetx.chemical','SEED Compound':'seed.compound','InChI Key':'inchikey'}\n", - " for db in db_id_bigg:\n", - " info = bigg_information[db].dropna().to_list()\n", - " if len(info) > 0:\n", - " info = ','.join(info)\n", - " info = [x.strip() for x in info.split(',')] # make sure all entries are a separate list object\n", - " new_metabolite.annotation[db_id_bigg[db]] = info\n", - "\n", - " # step 5: change ID according to namespace\n", - " # ----------------------------------------\n", - " match_id_to_namespace(new_metabolite,namespace)\n", - " \n", - " # step 6: re-check existence of ID in model\n", - " # -----------------------------------------\n", - " # @TODO : check complete annotations? \n", - " # - or let those be covered by the duplicate check later on?\n", - " if new_metabolite.id in [_.id for _ in model.metabolites]:\n", - " return model.metabolites.get_by_id(new_metabolite.id)\n", - "\n", - " return new_metabolite\n", - "\n", - "# @TODO\n", - "# @DOCS\n", - "# UNDER CONSTRUCTION\n", - "def get_metabolites_mnx(model,equation,mnx_chem_xref,mnx_chem_prop,bigg_metabolites, namespace):\n", - " \"\"\"Based on a given MetaNetX equation and a model, get or\n", - " create metabolite entires in/for the model.\n", - "\n", - " :param model: A GEM.\n", - " :type model: cobra.Model\n", - " :param equation: The equation from MetaNetX\n", - " :type equation: string\n", - " :param mnx_chem_xref: The chem_xref table from MetaNetX\n", - " :type mnx_chem_xref: pd.DataFrame\n", - " :param mnx_chem_prop: The chem_prop table from MetaNetX\n", - " :type mnx_chem_prop: pd.DataFrame\n", - " :param bigg_metabolites: The BiGG compound namespace table.\n", - " :type bigg_metabolites: pd.DataFrame\n", - " :returns: Dictonary of metabolites and stoichiometric factors.\n", - " :rtype: dict\n", - " \"\"\"\n", - "\n", - " # @TODO ...................................\n", - " # currently no checking for compartments\n", - " #..........................................\n", - "\n", - " model_metabolites = [m.formula for m in model.metabolites]\n", - " metabolites = {}\n", - " produced = -1.0\n", - " factor = 0\n", - "\n", - " for s in equation.split(' '):\n", - " # switch from reactants to products\n", - " if s == '=':\n", - " produced = 1.0\n", - " # found stoichiometric factor\n", - " elif s.isnumeric():\n", - " factor = float(s)\n", - " # skip\n", - " elif s == '+':\n", - " continue\n", - " # found metabolite\n", - " else:\n", - " # get information from MetaNetX\n", - " metabolite, compartment = s.split('@')\n", - " # build or identify metabolite\n", - " new_metabolite = build_metabolite_mnx(metabolite, model, mnx_chem_prop, mnx_chem_xref,bigg_metabolites, namespace)\n", - " # add metabolite\n", - " if new_metabolite.id in [_.id for _ in metabolites]:\n", - " # ......................................................\n", - " # @TODO: \n", - " # check if metabolite if both reactant and product\n", - " # suggests exchange reaction \n", - " # -> maybe a good place to change compartment for one?\n", - " # -> what about name and directions???\n", - " # ......................................................\n", - " try:\n", - " test = model.metabolites.get_by_id(new_metabolite.id)\n", - " new_metabolite = new_metabolite.copy()\n", - " new_metabolite.id = new_metabolite.id + '_i'\n", - " except:\n", - " new_metabolite.id = new_metabolite.id + '_i'\n", - "\n", - " metabolites[new_metabolite] = factor * produced\n", - "\n", - " return metabolites\n", - "\n", - "\n", - "# @TODO\n", - "# @DOCS\n", - "# UNDER CONSTRUCTION\n", - "def get_metabolites_kegg(model,equation,chem_xref,chem_prop,bigg_metabolites, namespace):\n", - " \"\"\"Based on a given KEGG equation and a model, get or\n", - " create metabolite entires in/for the model.\n", - "\n", - " :param model: A GEM.\n", - " :type model: cobra.Model\n", - " :param equation: The equation from KEGG\n", - " :type equation: string\n", - " :param chem_xref: The chem_xref table from MetaNetX\n", - " :type chem_xref: pd.DataFrame\n", - " :param chem_prop: The chem_prop table from MetaNetX\n", - " :type chem_prop: pd.DataFrame\n", - " :param bigg_metabolites: The BiGG compound namespace table.\n", - " :type bigg_metabolites: pd.DataFrame\n", - " :returns: Dictonary of metabolites and stoichiometric factors.\n", - " :rtype: dict\n", - " \"\"\"\n", - "\n", - " # @TODO ...................................\n", - " # currently no checking for compartments\n", - " #..........................................\n", - "\n", - " model_metabolites = [m.formula for m in model.metabolites]\n", - " model_kegg_ids = [m.annotation['kegg.compound'] for m in model.metabolites if 'kegg.compound' in m.annotation]\n", - " metabolites = {}\n", - " produced = -1.0\n", - " factor = 1\n", - " mnx_id = ''\n", - "\n", - " for s in equation.split(' '):\n", - " # switch from reactants to products\n", - " if '=' in s:\n", - " produced = 1.0\n", - " # found stoichiometric factor\n", - " elif s.isnumeric():\n", - " factor = float(s)\n", - " # skip\n", - " elif s == '+':\n", - " continue\n", - " # found metabolite\n", - " else:\n", - " # check if s is a valid ID\n", - " if '(' in s:\n", - " s = s.split('(')[0]\n", - " # ..................................\n", - " # @TODO\n", - " # known case: DNA(n) --> DNA(n+1)\n", - " # currently note in brackets gets ignored\n", - " # ..................................\n", - " elif not s.isalnum():\n", - " print('Problem: unknown character in ID inside get_metabolites_kegg() detected.\\nPlease contact dev about your problem.')\n", - " sys.exit(1)\n", - "\n", - " mnx_id = chem_xref[chem_xref['source'] == F'kegg.compound:{s}']\n", - " # case 1:\n", - " # add metabolite via MetaNetX\n", - " # -> make sure, only 1 ID match is found (match is unambiguous)\n", - " if len(mnx_id) == 1:\n", - " mnx_id = mnx_id['ID'].item()\n", - " metabolite = build_metabolite_mnx(mnx_id, model, chem_prop, chem_xref, bigg_metabolites, namespace)\n", - " # case 2:\n", - " # add metabolite via KEGG\n", - " else:\n", - " metabolite = build_metabolite_kegg(s, model, model_kegg_ids, bigg_metabolites, namespace)\n", - "\n", - " # add new metabolite\n", - " # @TODO : place to check for exchanges?\n", - " if metabolite.id in [_.id for _ in metabolites]:\n", - " try:\n", - " test = model.metabolites.get_by_id(metabolite.id)\n", - " metabolite = metabolite.copy()\n", - " metabolite.id = metabolite.id + '_i'\n", - " except:\n", - " metabolite.id = metabolite.id + '_i'\n", - "\n", - " metabolites[metabolite] = factor * produced\n", - "\n", - " return metabolites\n", - "\n", - "\n", - "\n", - "def add_gene(model, reaction, row, first=False):\n", - " \"\"\"Add a new gene to a genome-scale metabolic cobra model.\n", - "\n", - " :param model: The model.\n", - " :type model: cobra.Model\n", - " :param reaction: The reaction id to add the gene to.\n", - " :type reaction: string\n", - " :param row: A single row of the output table of map_BiGG_reactions().\n", - " :type row: pd.Series\n", - " :param first: Shows, if gene is the first gene to be added to the reaction.\n", - " :type first: bool, true if gene is first to be added.\n", - " :returns: The updated model.\n", - " :rtype: cobra.Model\n", - " \"\"\"\n", - "\n", - " # add gene\n", - " if first or model.reactions.get_by_id(reaction).gene_reaction_rule == '':\n", - " model.reactions.get_by_id(reaction).gene_reaction_rule = row[0]\n", - " else:\n", - " model.reactions.get_by_id(reaction).gene_reaction_rule = model.reactions.get_by_id(reaction).gene_reaction_rule + ' or ' + row[0]\n", - "\n", - " # add name\n", - " model.genes.get_by_id(row[0]).name = row[1]\n", - "\n", - " # add annotations\n", - " if not pd.isnull(row[4]):\n", - " model.genes.get_by_id(row[0]).annotation['ncbigene'] = row[4]\n", - " model.genes.get_by_id(row[0]).annotation['ncbiprotein'] = row[2].split('.')[0]\n", - " # note: annotations like sbo, kegg.genes and uniprot missing\n", - "\n", - " return model\n", - "\n", - "\n", - "# UNDER CONSTRUCTION\n", - "def add_reaction(model,row,reac_xref,reac_prop,chem_xref,chem_prop,bigg_metabolites, namespace:str='BiGG', exclude_dna=True, exclude_rna=True):\n", - "\n", - " # create reaction object\n", - " reac = cobra.Reaction(create_random_id(model,'reac','SPECIMEN'))\n", - "\n", - " # ----------------------------\n", - " # curate reaction via MetaNetX\n", - " # ----------------------------\n", - " # try kegg.reaction --> metanetx.reaction\n", - " if F'kegg.reaction:{row[\"KEGG.reaction\"]}' in list(reac_xref['source']):\n", - " \n", - " # get MetaNetX ID\n", - " met_reac_kegg = reac_xref[reac_xref['source']==F'kegg.reaction:{row[\"KEGG.reaction\"]}']\n", - " met_reac = reac_prop[reac_prop['ID']==met_reac_kegg['ID'].iloc[0]]\n", - "\n", - " # make sure exactly one entry is parsed\n", - " # @TODO : parallel parsing\n", - " if len(met_reac) > 1:\n", - " print(F'Warning: multiple matches for kegg.reaction {row[\"KEGG.reaction\"]} found. Only first one will be used.')\n", - " met_reac = met_reac.head(1)\n", - "\n", - " # add name\n", - " # --------\n", - " # from MetaNetX KEGG description\n", - " reac.name = met_reac_kegg['description'].iloc[0].split('|')[0]\n", - "\n", - " # add notes\n", - " # ---------\n", - " reac.notes['creation'] = 'via MetaNetX'\n", - " reac.notes['KEGG.information'] = row['KEGG.notes']\n", - "\n", - " # add metabolites\n", - " # ----------------\n", - " reac.add_metabolites(get_metabolites_mnx(model,met_reac['mnx equation'].iloc[0],chem_xref,chem_prop,bigg_metabolites, namespace))\n", - " #@TODO .............\n", - " # direction of reaction\n", - " # ---> current solution:\n", - " # use one direction only\n", - " # ..................\n", - "\n", - " # add annotations\n", - " # ---------------\n", - " reac.annotation['ec-code'] = row['EC number']\n", - " reac.annotation['kegg.reaction'] = row['KEGG.reaction']\n", - " reac.annotation['metanetx.reaction'] = met_reac_kegg['ID'].iloc[0]\n", - " met_reac_anno = reac_xref[reac_xref['ID']==met_reac_kegg['ID'].iloc[0]]\n", - " for db in ['metacyc.reaction','seed.reaction','rhea','bigg.reaction']:\n", - " db_matches = met_reac_anno[met_reac_anno['source'].str.contains(db)]\n", - " if len(db_matches) == 1:\n", - " reac.annotation[db] = db_matches['source'].iloc[0].split(':',1)[1]\n", - " elif len(db_matches) > 1:\n", - " reac.annotation[db] = [r.split(':',1)[1] for r in db_matches['source'].tolist()]\n", - " else:\n", - " continue\n", - " \n", - " # if not possible, use information from KEGG only\n", - " # ------------------------\n", - " # curate reaction via KEGG\n", - " # ------------------------\n", - " else:\n", - " \n", - " # retrieve reaction information from KEGG\n", - " reac_kegg = kegg_reaction_parser(row['KEGG.reaction'])\n", - "\n", - " # add name\n", - " # --------\n", - " # from KEGG name\n", - " reac.name = reac_kegg['name']\n", - "\n", - " # add notes\n", - " # ---------\n", - " reac.notes['creation'] = 'via KEGG'\n", - " reac.notes['KEGG.information'] = row['KEGG.notes']\n", - "\n", - " # add metabolites\n", - " # ----------------\n", - " reac.add_metabolites(get_metabolites_kegg(model,reac_kegg['equation'],chem_xref,chem_prop,bigg_metabolites, namespace))\n", - " #@TODO .............\n", - " # direction of reaction\n", - " # ---> current solution:\n", - " # use one direction only\n", - " # ..................\n", - "\n", - " # add annotations\n", - " # ---------------\n", - " reac.annotation['ec-code'] = row['EC number']\n", - " reac.annotation['kegg.reaction'] = row['KEGG.reaction']\n", - " for db, identifiers in reac_kegg['db'].items():\n", - " if len(identifiers) == 1:\n", - " reac.annotation[db] = identifiers[0]\n", - " else:\n", - " reac.annotation[db] = identifiers\n", - "\n", - "\n", - " # --------------------------------------\n", - " # re-set ID to fit namespace if possible\n", - " # --------------------------------------\n", - " match_id_to_namespace(reac, namespace)\n", - "\n", - " # ---------------------\n", - " # add reaction to model\n", - " # ---------------------\n", - " \n", - " # if the ID change results in an ID already in the model, use that reaction\n", - " if reac.id in [_.id for _ in model.reactions]:\n", - " print(f'{reac.id} already in model, not added a second time.')\n", - " else:\n", - " # check if reaction is complete\n", - " # and fullfills the requirements / parameters\n", - " if isreaction_complete(reac, exclude_dna, exclude_rna):\n", - " model.add_reactions([reac])\n", - " else:\n", - " print(F'reaction {reac.name} for gene {row[\"locus_tag\"]} could not be completely reconstructed, not added to model.')\n", - " return model\n", - "\n", - " # --------\n", - " # add gene\n", - " # --------\n", - " # check if gene is already in model\n", - " if row['locus_tag'] in model.genes:\n", - " # if - for whatever reason - gene already in gpr, skip\n", - " if row['locus_tag'] in model.reactions.get_by_id(reac.id).gene_reaction_rule:\n", - " return model\n", - " # create new gpr, if nonexistent\n", - " elif not model.reactions.get_by_id(reac.id).gene_reaction_rule or len(model.reactions.get_by_id(reac.id).gene_reaction_rule) == 0:\n", - " model.reactions.get_by_id(reac.id).gene_reaction_rule = row['locus_tag']\n", - " # add gene to existing gpr\n", - " else:\n", - " model.reactions.get_by_id(reac.id).gene_reaction_rule = model.reactions.get_by_id(reac.id).gene_reaction_rule + ' or ' + row['locus_tag']\n", - " else:\n", - " # add (to) gene reaction rule and curate new gene object\n", - " if not model.reactions.get_by_id(reac.id).gene_reaction_rule or len(model.reactions.get_by_id(reac.id).gene_reaction_rule) == 0:\n", - " model = add_gene(model, reac.id, row, first=True)\n", - " else:\n", - " model = add_gene(model, reac.id, row, first=False)\n", - "\n", - " return model\n", - "\n", - "# notes\n", - "# @CHECK : connections, e.g. input is now a param short \n", - "def extent_model(table, model,chem_prop_file,chem_xref_file,reac_prop_file,reac_xref_file, namespace, exclude_dna=True, exclude_rna=True):\n", - " \"\"\"Add reactions, metabolites and genes to a model based on the output of map_to_bigg().\n", - "\n", - " :param table: The table with the information to be added to the model.\n", - " :type table: pd.DataFrame, output of map_to_bigg\n", - " :param model: The genome-scale metabolic model to be extended\n", - " :type model: cobra.Model\n", - " :param chem_prop_file: Path to the MetaNetX chem_prop file.\n", - " :type chem_prop_file: string\n", - " :param chem_xref_file: Path to the MetaNetX chem_xref file.\n", - " :type chem_xref_file: string\n", - " :param reac_prop_file: Path to the MetaNetX reac_prop file.\n", - " :type reac_prop_file: string\n", - " :param reac_xref_file: Path to the MetaNetX reac_xref file.\n", - " :type reac_xref_file: string\n", - " :param exclude_dna: Tag to include or exclude DNA reactions.\n", - " :type exclude_dna: bool, default is True.\n", - " :param exclude_rna: Tag to include or exclude RNA reactions.\n", - " :type exclude_rna: bool, default is True.\n", - " :returns: The extended model.\n", - " :rytpe: cobra.Model\n", - " \"\"\"\n", - "\n", - " # load MetaNetX database / namespace\n", - " chem_prop = pd.read_csv(chem_prop_file, sep='\\t', comment='#', names=['ID','name','reference','formula','charge','mass','InChI','InChIKey','SMILES'])\n", - " chem_xref = pd.read_csv(chem_xref_file, sep='\\t', comment='#', names=['source','ID','description'])\n", - "\n", - " reac_prop = pd.read_csv(reac_prop_file, sep='\\t', comment='#', names=['ID','mnx equation','reference','classifs','is_balanced','is_transport'])\n", - " reac_xref = pd.read_csv(reac_xref_file, sep='\\t', comment='#', names=['source','ID','description'])\n", - "\n", - " # load bigg metabolite namespace\n", - " bigg_metabolites = load_a_table_from_database('bigg_metabolites', False)\n", - " bigg_metabolites.rename(columns={'id':'bigg_id'}, inplace=True)\n", - " bigg_metabolites = bigg_metabolites[['bigg_id','universal_bigg_id','name','CHEBI','BioCyc','KEGG Compound','MetaNetX (MNX) Chemical','SEED Compound','InChI Key']]\n", - "\n", - " # add genes one by one to model\n", - " print('\\tAdding genes and if needed reactions and metabolites to model:')\n", - " for row_idx in tqdm(table.index):\n", - "\n", - " # generate Name -> KEGG.reaction dictionary\n", - " react_dict = make_reaction_annotation_dict(model,'KEGG')\n", - " # generate Name -> BiGG.reaction dictionary\n", - " react_dict_2 = make_reaction_annotation_dict(model,'BiGG')\n", - "\n", - " # get row in pandas format\n", - " row = table.iloc[row_idx]\n", - "\n", - "\n", - " # case 1: BiGG name already in model = reaction in model\n", - " if not pd.isnull(row['bigg_id']) and any((True for _ in row['bigg_id'].split(' ') if _ in react_dict_2.values())):\n", - " # get matching reaction id(s)\n", - " reac_found = [_ for _ in row['bigg_id'].split(' ') if _ in react_dict_2.values()]\n", - " # add genes to all reactions\n", - " for r in reac_found:\n", - " model = add_gene(model, r, row)\n", - "\n", - " # case 1: KEGG reaction ID in model = reaction probably in model as well\n", - " elif row['KEGG.reaction'] in react_dict.values():\n", - " # get corresponding reaction\n", - " react_found = [_ for _ in react_dict.keys() if row['KEGG.reaction'] == react_dict[_]]\n", - " # add gene to all reactions found\n", - " for r in react_found:\n", - " model = add_gene(model,r,row)\n", - "\n", - " # case 3: reaction not in model\n", - " # -> add reaction(s), gene and metabolites if needed\n", - " else:\n", - " # case 3.1: one reaction\n", - " react = row['KEGG.reaction'].split(' ')\n", - " if len(react) == 1:\n", - " model = add_reaction(model,row,reac_xref,reac_prop,chem_xref,chem_prop,bigg_metabolites, namespace, exclude_dna, exclude_rna)\n", - "\n", - " # case 3.2: multiple reactions\n", - " # add each reaction separatly, with the same gene for th gene reaction rule\n", - " # note: zero reactions not possible due to previous filtering\n", - " else:\n", - "\n", - " for r in react:\n", - " temp_row = row.copy(deep=True)\n", - " temp_row['KEGG.reaction'] = r\n", - " model = add_reaction(model,temp_row,reac_xref,reac_prop,chem_xref,chem_prop,bigg_metabolites, namespace, exclude_dna, exclude_rna)\n", - "\n", - " return model\n", - "\n" - ] + "source": [] }, { "cell_type": "markdown", @@ -996,199 +69,6 @@ "### Test Area 51" ] }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "mg = pd.read_csv(mapped_genes_path)\n", - "mapped_genes_comp = map_BiGG_reactions(mapped_genes_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\tAdding genes and if needed reactions and metabolites to model:\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e2bb8d34305c46d7b915363193783ee0", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/53 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "reaction tRNA-uridine uracilmutase for gene AB-1_S128_00258 could not be completely reconstructed, not added to model.\n", - "reaction L-Methionine:tRNAMet ligase (AMP-forming) for gene AB-1_S128_00407 could not be completely reconstructed, not added to model.\n", - "reaction Aminoacyl-tRNA aminoacylhydrolase for gene AB-1_S128_00466 could not be completely reconstructed, not added to model.\n", - "reaction S-adenosyl-L-methionine:rRNA (adenine-N6-)-methyltransferase for gene AB-1_S128_01553 could not be completely reconstructed, not added to model.\n", - "reaction deoxynucleoside triphosphate:DNA deoxynucleotidyltransferase for gene AB-1_S128_02056 could not be completely reconstructed, not added to model.\n", - "reaction L-Cysteine:tRNA(Cys) ligase (AMP-forming) for gene AB-1_S128_02165 could not be completely reconstructed, not added to model.\n", - "reaction deoxynucleoside triphosphate:DNA deoxynucleotidyltransferase for gene AB-1_S128_02195 could not be completely reconstructed, not added to model.\n", - "reaction tRNA-guanosine34:7-aminomethyl-7-carbaguanine tRNA-D-ribosyltransferase for gene AB-1_S128_02293 could not be completely reconstructed, not added to model.\n", - "reaction deoxynucleoside triphosphate:DNA deoxynucleotidyltransferase for gene AB-1_S128_02491 could not be completely reconstructed, not added to model.\n", - "reaction S-Adenosyl-L-methionine:tRNA guanine N7-methyltransferase for gene AB-1_S128_02703 could not be completely reconstructed, not added to model.\n", - "reaction queuosine34 in tRNA:acceptor oxidoreductase for gene AB-1_S128_02971 could not be completely reconstructed, not added to model.\n", - "reaction L-threonylcarbamoyladenylate:adenine in tRNA N6-L-threonylcarbamoyltransferase for gene AB-1_S128_02973 could not be completely reconstructed, not added to model.\n", - "reaction delta2-isopentenyl-diphosphate:tRNA isopentenyltransferase for gene AB-1_S128_02976 could not be completely reconstructed, not added to model.\n", - "reaction L-Asparagine:tRNA(Asn) ligase (AMP-forming) for gene AB-1_S128_03362 could not be completely reconstructed, not added to model.\n", - "reaction nucleoside-triphosphate:RNA nucleotidyltransferase (DNA-directed); for gene AB-1_S128_03866 could not be completely reconstructed, not added to model.\n", - "reaction L-proline:tRNA(Pro) ligase (AMP-forming) for gene AB-1_S128_04260 could not be completely reconstructed, not added to model.\n", - "reaction deoxynucleoside triphosphate:DNA deoxynucleotidyltransferase for gene AB-1_S128_04393 could not be completely reconstructed, not added to model.\n", - "reaction L-Tryptophan -tRNA(Trp) ligase (AMP-forming) for gene AB-1_S128_04485 could not be completely reconstructed, not added to model.\n", - "reaction glutharedoxin:arsenate oxidoreductase for gene AB-1_S128_05136 could not be completely reconstructed, not added to model.\n", - "reaction nucleoside-triphosphate:RNA nucleotidyltransferase (DNA-directed); for gene AB-1_S128_05470 could not be completely reconstructed, not added to model.\n", - "reaction deoxynucleoside triphosphate:DNA deoxynucleotidyltransferase for gene AB-1_S128_05605 could not be completely reconstructed, not added to model.\n", - "reaction beta-Lactamhydrolase for gene AB-1_S128_05725 could not be completely reconstructed, not added to model.\n" - ] - } - ], - "source": [ - "draft_model = load_model(draft_model_path, 'cobra')\n", - "new_model = extent_model(mapped_genes_comp, \n", - " draft_model,\n", - " chem_prop_file,\n", - " chem_xref_file,\n", - " reac_prop_file,\n", - " reac_xref_file, \n", - " namespace='BiGG', \n", - " exclude_dna=True, exclude_rna=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "SPECIMENreac2A7RQ7\n", - "SPECIMENreac81FWMJ\n", - "SPECIMENreacPKXFUQ\n", - "AGMT\n", - "SPECIMENreac3WTKGW\n", - "R_TPRDCOAS\n", - "SPECIMENreacCFJ7N8\n", - "SPECIMENreacJPD1V9\n", - "SPECIMENreacQHC8MB\n", - "SPECIMENreacNO6KCQ\n", - "SPECIMENreac522PJU\n", - "SPECIMENreacJU0MHQ\n", - "SPECIMENreac7U4580\n", - "SPECIMENreac0OTQ1Q\n", - "SPECIMENreac8JUQXD\n", - "SPECIMENreacHPK3IW\n", - "SPECIMENreacTNN7GZ\n", - "MAN1PT\n", - "SPECIMENreac44OE2G\n", - "SPECIMENreacZ3S4RB\n", - "SPECIMENreacQBYJ8V\n", - "SPECIMENreacJOSC6B\n", - "SPECIMENreac9ZVS32\n", - "23\n" - ] - } - ], - "source": [ - "c = 0\n", - "for x in new_model.reactions:\n", - " if x.notes['creation'] != 'via template':\n", - " c += 1\n", - " print(x.id)\n", - "print(c)" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
Reaction identifier | SPECIMENreacJU0MHQ | \n", - "
Name | GTP:GTP guanylyltransferase | \n", - "
Memory address | \n", - "0x1fbd61060 | \n", - "
Stoichiometry | \n", - "\n",
- " 2.0 gtp_c --> SPECIMENmeta0B2QH7 + 2.0 ppi_c \n", - "2.0 GTP C10H12N5O14P3 --> cyclic di-3',5'-guanylate + 2.0 Diphosphate \n", - " | \n",
- "
GPR | AB-1_S128_02424 | \n", - "
Lower bound | 0.0 | \n", - "
Upper bound | 1000.0 | \n", - "