Skip to content

Commit

Permalink
Merge pull request #6 from ncats/sqlite_inchi_duplex_patch
Browse files Browse the repository at this point in the history
Sqlite inchi duplex patch, reactome gene patch, Rhea reaction class addition.
  • Loading branch information
johnbraisted authored Nov 9, 2023
2 parents 796e5f9 + af85eb3 commit 6bed4f3
Show file tree
Hide file tree
Showing 19 changed files with 954 additions and 297 deletions.
3 changes: 2 additions & 1 deletion config/db_load_resource_config.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ ready analytesource.txt source bulk None "sourceId,rampId,IDtype,geneOrCompound,
ready analytesynonym.txt analytesynonym bulk None "Synonym,rampId,geneOrCompound,source"
ready analytetopathway.txt analytehaspathway bulk None "rampId,pathwayRampId,pathwaySource"
ready analyte.txt analyte bulk rampId "rampId,type"
ready catalyzes.txt catalyzed bulk None "rampCompoundId,rampGeneId, proteinType"
ready catalyzes.txt catalyzed bulk None "rampCompoundId,rampGeneId,proteinType"
empty reactomecatalyzed.sql catalyzed bulk None "rampCompoundId,rampGeneId"
empty wikicatalyzed.sql catalyzed bulk None "rampCompoundId,rampGeneId"
ready chemProps.txt chem_props bulk None "ramp_id,chem_data_source,chem_source_id,iso_smiles,inchi_key_prefix,inchi_key,inchi,mw,monoisotop_mass,common_name,mol_formula"
Expand All @@ -15,3 +15,4 @@ ready reaction.txt reaction bulk ramp_rxn_id "ramp_rxn_id,rxn_source_id,status,i
ready reaction_to_metabolite.txt reaction2met bulk None "ramp_rxn_id,rxn_source_id,ramp_cmpd_id,substrate_product,met_source_id,met_name,is_cofactor"
ready reaction_to_protein.txt reaction2protein bulk None "ramp_rxn_id,rxn_source_id,ramp_gene_id,uniprot,protein_name"
ready reaction_protein_to_metabolite.txt reaction_protein2met bulk None "ramp_rxn_id,rxn_source_id,ramp_gene_id,gene_source_id,substrate_product,ramp_cmpd_id,cmpd_source_id,cmpd_name,is_cofactor"
ready rheaReactionToEcClass.txt reaction_ec_class bulk None "ramp_rxn_id,rxn_source_id,rxn_class_ec,ec_level,rxn_class,rxn_class_hierarchy"
3 changes: 2 additions & 1 deletion config/external_resource_config.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ hmdb_gene http https://hmdb.ca/system/downloads/current/hmdb_proteins.zip hmdb_p
hmdb_met_sdf http https://hmdb.ca/system/downloads/current/structures.zip structures.zip structures.sdf ../misc/data/chemprops/hmdb/ zip chem_props_sdf
reactome_met http http://www.reactome.org/download/current/ChEBI2Reactome_All_Levels.txt ChEBI2Reactome_All_Levels.txt ChEBI2Reactome_All_Levels.txt ../misc/data/reactome/ none pathways_mets
reactome_gene http http://www.reactome.org/download/current/UniProt2Reactome_All_Levels.txt UniProt2Reactome_All_Levels.txt UniProt2Reactome_All_Levels.txt ../misc/data/reactome/ none pathways_genes
wiki_pathways_mets_genes http https://wikipathways-data.wmcloud.org/current/rdf/wikipathways-20230710-rdf-wp.zip wikipathways-20230710-rdf-wp.zip ./wp/ ../misc/data/wikipathwaysRDF/ zip pathways_mets_genes
wiki_pathways_mets_genes http https://wikipathways-data.wmcloud.org/current/rdf/wikipathways-20231010-rdf-wp.zip wikipathways-20231010-rdf-wp.zip ./wp/ ../misc/data/wikipathwaysRDF/ zip pathways_mets_genes
chebi_met_sdf ftp https://ftp.ebi.ac.uk/pub/databases/chebi/SDF/ChEBI_complete_3star.sdf.gz ChEBI_complete_3star.sdf.gz ChEBI_complete_3star.sdf ../misc/data/chemprops/chebi/ gzip chem_props_sdf
lipidmaps_met http https://www.lipidmaps.org/files/?file=LMSD&ext=sdf.zip LMSD.sdf.zip structures.sdf ../misc/data/chemprops/lipidmaps/ zip chem_props_sdf
uniprot_human http https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_human.dat.gz uniprot_sprot_human.dat.gz uniprot_sprot_human.dat ../misc/data/uniprot_human/ gzip proteins
Expand All @@ -14,3 +14,4 @@ rhea_to_ec http https://ftp.expasy.org/databases/rhea/tsv/rhea2ec.tsv rhea2ec.ts
rhea_rxn_direction http https://ftp.expasy.org/databases/rhea/tsv/rhea-directions.tsv rhea-directions.tsv rhea-directions.tsv ../misc/data/rhea/ none rhea_rxn_direction_table
chebi_to_chebi_relations http http://ftp.ebi.ac.uk/pub/databases/chebi/Flat_file_tab_delimited/relation.tsv relation.tsv relation.tsv ../misc/data/chebi/ none chebi_relations
chebi_ontology_owl http http://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.owl.gz chebi.owl.gz chebi.owl ../misc/data/chebi/ gzip chebi_ontology
expasy_ec2class ftp https://ftp.expasy.org/databases/enzyme/enzclass.txt enzclass.txt enzclass.txt ../misc/data/rhea none expasy_ec2class
14 changes: 7 additions & 7 deletions config/ramp_resource_version_update.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
ramp_db_version db_mod_date status data_source_id data_source_name data_source_url data_source_version
v2.3.0 7/20/2023 current hmdb HMDB https://hmdb.ca/ v5.0 (2021-11-17)
v2.3.0 7/20/2023 current reactome Reactome https://reactome.org/ v85 (May 2023)
v2.3.0 7/20/2023 current wiki WikiPathways https://www.wikipathways.org/index.php/WikiPathways v20230710 (2023-07-10)
v2.3.0 7/20/2023 current kegg KEGG https://www.genome.jp/kegg/ from HMDB (v5.0) (2021-11-17)
v2.3.0 7/20/2023 current chebi ChEBI https://www.ebi.ac.uk/chebi/ Release 223 (2023-07-01)
v2.3.0 7/20/2023 current lipidmaps Lipid Maps https://www.lipidmaps.org/ Release 2023-07-12
v2.3.0 8/3/2022 current rhea Rhea https://www.rhea-db.org/ Release 128 (2023-06-28)
v2.4.1 10/24/2023 current hmdb HMDB https://hmdb.ca/ v5.0 (2021-11-17)
v2.4.1 10/24/2023 current reactome Reactome https://reactome.org/ v86 (Sep 2023)
v2.4.1 10/24/2023 current wiki WikiPathways https://www.wikipathways.org/index.php/WikiPathways v20231010 (2023-10-10)
v2.4.1 10/24/2023 current kegg KEGG https://www.genome.jp/kegg/ from HMDB (v5.0) (2021-11-17)
v2.4.1 10/24/2023 current chebi ChEBI https://www.ebi.ac.uk/chebi/ Release 226 (2023-10-01)
v2.4.1 10/24/2023 current lipidmaps Lipid Maps https://www.lipidmaps.org/ Release 2023-10-24
v2.4.1 10/24/2023 current rhea Rhea https://www.rhea-db.org/ Release 129 (2023-09-13)
227 changes: 23 additions & 204 deletions main/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,225 +16,42 @@
class Main():

def runEverything(self, resourceConfigFile, getDatabaseFiles = True):

start = time.time()

sql = writeToSQL()

# build the ramp resource config
resourceConf = RampConfig()
resourceConf.loadConfig(resourceConfigFile)

#stat = getStatistics()
stat = getStatistics()
hmdb = hmdbData(resourceConf)
wikipathways = WikipathwaysRDF(resourceConf)
reactome = reactomeData(resourceConf)
kegg = KeggData()
lipidmaps = lipidmapsChemData(resourceConf)
rhea = RheaParser(resourceConf)

# works based on your computer, setup working directory
os.chdir('../main/')
#
# #kegg.getEverything(False)
# #print("KEGG Wonder")
# print("Getting hmdb...")
# hmdb.getEverything(True)
# print("Getting wiki...")
# wikipathways.getEverything(True)
# print("Getting reactome...")
# reactome.getEverything(True)
#
# # This parses and writes lipid maps
# # sql write will be handled by EntityBuilder
# print("Getting LipidMaps...")
# lipidmaps.getEverything(True)
#
# print("Getting Rhea info...")
# rhea.processRhea()



#
# #Here are the identifiers that are present for each gene:
# #kegg: keggid (mainID), 'Ensembl', 'HGNC', 'HPRD', 'NCBI-GeneID', 'NCBI-ProteinID', 'OMIM', 'UniProt', 'Vega', 'miRBase'
# #wikipathways: (no mainID), 'Entrez', 'Enzyme Nomenclature', 'Uniprot (Uniprot-TrEMBL)
# #hmdb: HMDB-protien-accession (mainID), 'Uniprot'
# #reactome:Uniprot (mainID)
#
# print('Generate compound id')
# hmdbcompoundnum = sql.createRampCompoundID(hmdb.metaboliteIDDictionary, "hmdb", 0)
# print("hmdbcompoundnum: ", hmdbcompoundnum)
# keggcompoundnum = sql.createRampCompoundID(kegg.metaboliteIDDictionary, "kegg", hmdbcompoundnum)
# wikicompoundnum = sql.createRampCompoundID(wikipathways.metaboliteIDDictionary, "wiki", keggcompoundnum)
# print("wikicompoundnum: ", wikicompoundnum)
# reactomecompoundnum = sql.createRampCompoundID(reactome.metaboliteIDDictionary, "reactome", wikicompoundnum)
#
# print('Generate gene id ...')
# hmdbgenenum = sql.createRampGeneID(hmdb.geneInfoDictionary, "hmdb", 0)
# kegggenenum = sql.createRampGeneID(kegg.geneInfoDictionary, "kegg", hmdbgenenum)
# wikigenenum = sql.createRampGeneID(wikipathways.geneInfoDictionary, "wiki", kegggenenum)
# reactomegenenum = sql.createRampGeneID(reactome.geneInfoDictionary, "reactome", wikigenenum)
# print(" hmdbgenenum ", hmdbgenenum, " kegggenenum ", kegggenenum, " wikigenenum ", wikigenenum, " reactomegenenum ", reactomegenenum)
""" print('Write to sql file...')
hmdbnumbers = sql.write(
hmdb.metaboliteCommonName,
hmdb.pathwayDictionary,
hmdb.pathwayCategory,
hmdb.metabolitesWithPathwaysDictionary,
hmdb.metabolitesWithSynonymsDictionary,
hmdb.metaboliteIDDictionary,
hmdb.pathwaysWithGenesDictionary,
hmdb.metabolitesLinkedToGenes,
hmdb.geneInfoDictionary,
hmdb.biofluidLocation,
hmdb.biofluid,
hmdb.cellularLocation,
hmdb.cellular,
hmdb.pathwayOntology,
hmdb.exoEndoDictionary,
hmdb.exoEndo,
hmdb.tissueLocation,
hmdb.tissue,
hmdb.metaInchi,
"hmdb",
0,0)
wikipathwaysnumbers = sql.write(
wikipathways.metaboliteCommonName,
wikipathways.pathwayDictionary,
wikipathways.pathwayCategory,
wikipathways.metabolitesWithPathwaysDictionary,
wikipathways.metabolitesWithSynonymsDictionary,
wikipathways.metaboliteIDDictionary,
wikipathways.pathwaysWithGenesDictionary,
wikipathways.metabolitesLinkedToGenes,
wikipathways.geneInfoDictionary,
wikipathways.biofluidLocation,
wikipathways.biofluid,
wikipathways.cellularLocation,
wikipathways.cellular,
wikipathways.pathwayOntology,
wikipathways.exoEndoDictionary,
wikipathways.exoEndo,
wikipathways.tissueLocation,
wikipathways.tissue,
dict(),
"wiki",
hmdbnumbers[0],hmdbnumbers[1])
reactomenumbers = sql.write(
reactome.metaboliteCommonName,
reactome.pathwayDictionary,
reactome.pathwayCategory,
reactome.metabolitesWithPathwaysDictionary,
reactome.metabolitesWithSynonymsDictionary,
reactome.metaboliteIDDictionary,
reactome.pathwaysWithGenesDictionary,
reactome.metabolitesLinkedToGenes,
reactome.geneInfoDictionary,
reactome.biofluidLocation,
reactome.biofluid,
reactome.cellularLocation,
reactome.cellular,
reactome.pathwayOntology,
reactome.exoEndoDictionary,
reactome.exoEndo,
reactome.tissueLocation,
reactome.tissue,
dict(),
"reactome",
wikipathwaysnumbers[0],wikipathwaysnumbers[1])
keggnumbers = sql.write(
kegg.metaboliteCommonName,
kegg.pathwayDictionary,
kegg.pathwayCategory,
kegg.metabolitesWithPathwaysDictionary,
kegg.metabolitesWithSynonymsDictionary,
kegg.metaboliteIDDictionary,
kegg.pathwaysWithGenesDictionary,
kegg.metabolitesLinkedToGenes,
kegg.geneInfoDictionary,
kegg.biofluidLocation,
kegg.biofluid,
kegg.cellularLocation,
kegg.cellular,
kegg.pathwayOntology,
kegg.exoEndoDictionary,
kegg.exoEndo,
kegg.tissueLocation,
kegg.tissue,
dict(),
"kegg",
reactomenumbers[0],reactomenumbers[1])

print("Done ... for importing database")
#kegg.getEverything(False)
#print("KEGG Wonder")
print("Getting hmdb...")
hmdb.getEverything(True)
print("Getting wiki...")
wikipathways.getEverything(True)
print("Getting reactome...")
reactome.getEverything(True)

print("Compound:")
stat.analyteOverlaps(sql.rampCompoundIdInWhichDatabases, sql.rampCompoundIDdictionary, "Compound")
print("\n")
print("Gene:")
stat.analyteOverlaps(sql.rampGeneIdInWhichDatabases, sql.rampGeneIDdictionary, "Gene")
# This parses and writes lipid maps
# sql write will be handled by EntityBuilder
print("Getting LipidMaps...")
lipidmaps.getEverything(True)

stat.databaseContent(hmdb.pathwayDictionary,
hmdb.pathwayCategory,
hmdb.metabolitesWithPathwaysDictionary,
hmdb.metabolitesWithSynonymsDictionary,
hmdb.metaboliteIDDictionary,
hmdb.pathwaysWithGenesDictionary,
hmdb.geneInfoDictionary,
hmdb.biofluidLocation,
hmdb.biofluid,
hmdb.cellularLocation,
hmdb.cellular,
hmdb.pathwayOntology,
hmdb.exoEndoDictionary,
"hmdb")
stat.databaseContent(kegg.pathwayDictionary,
kegg.pathwayCategory,
kegg.metabolitesWithPathwaysDictionary,
kegg.metabolitesWithSynonymsDictionary,
kegg.metaboliteIDDictionary,
kegg.pathwaysWithGenesDictionary,
kegg.geneInfoDictionary,
kegg.biofluidLocation,
kegg.biofluid,
kegg.cellularLocation,
kegg.cellular,
kegg.pathwayOntology,
kegg.exoEndoDictionary,
"kegg")
stat.databaseContent(reactome.pathwayDictionary,
reactome.pathwayCategory,
reactome.metabolitesWithPathwaysDictionary,
reactome.metabolitesWithSynonymsDictionary,
reactome.metaboliteIDDictionary,
reactome.pathwaysWithGenesDictionary,
reactome.geneInfoDictionary,
reactome.biofluidLocation,
reactome.biofluid,
reactome.cellularLocation,
reactome.cellular,
reactome.pathwayOntology,
reactome.exoEndoDictionary,
"reactome")
stat.databaseContent(wikipathways.pathwayDictionary,
wikipathways.pathwayCategory,
wikipathways.metabolitesWithPathwaysDictionary,
wikipathways.metabolitesWithSynonymsDictionary,
wikipathways.metaboliteIDDictionary,
wikipathways.pathwaysWithGenesDictionary,
wikipathways.geneInfoDictionary,
wikipathways.biofluidLocation,
wikipathways.biofluid,
wikipathways.cellularLocation,
wikipathways.cellular,
wikipathways.pathwayOntology,
wikipathways.exoEndoDictionary,
"wiki")
"""
print("Getting Rhea info...")
rhea.processRhea()

# constructs the entity builder
builder = EntityBuilder(resourceConf)
Expand All @@ -244,10 +61,12 @@ def runEverything(self, resourceConfigFile, getDatabaseFiles = True):
# the result are files for DB loading in /misc/sql

builder.fullBuild()

print(time.time() - start)


# Database loading is handled as a separate, un-coupled step.


resourceConfFile = "../config/external_resource_config.txt"
main = Main()
main.runEverything(resourceConfigFile = resourceConfFile)
Expand Down
12 changes: 7 additions & 5 deletions main/mainDBLoad.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from util.rampDBBulkLoader import rampDBBulkLoader



class mainDBLoad():

def __init__(self):
Expand Down Expand Up @@ -68,14 +67,17 @@ def loadDBAfterTruncatingTables(self, incrementLevel = 'increment_patch_release'
# this method populates a table that reflects the current status of the database.
# metrics such as gene and metabolite counts for each data set are tallied.
loader.updateDataStatusSummary()


# generate pathway similarity matrices, analyte lists and whatnot
# this process replaced the old system of having Rdata in the package
loader.generateAndLoadRampSupplementalData()

loader = mainDBLoad()

# increment level 'increment_patch_release', 'increment_minor_release',
# or 'specified' (new version, perhaps major release)
loader.loadDBAfterTruncatingTables(incrementLevel = 'increment_patch_release',
optionalVersionOveride = "",
optionalVersionNote = "20220822 patch release, update chem_props inchi values.",
loader.loadDBAfterTruncatingTables(incrementLevel = 'specified',
optionalVersionOveride = "2.4.2",
optionalVersionNote = "20231107 Data refresh. Rhea Reaction Classes. Reactome gene patch.",
truncateTables=True)

8 changes: 4 additions & 4 deletions main/mainSqliteDBLoad.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def loadDBAfterTruncatingTables(self, sqliteFile, incrementLevel = 'increment_pa

# pass the db properties file and the sqlite file name to the constructed SQLiteDBBulkLoader

loader = SQLiteDBBulkLoader(self.dbPropsFile, sqliteFile)
loader = SQLiteDBBulkLoader(dbPropsFile=self.dbPropsFile, sqliteFileName=sqliteFile)


# truncate tables
Expand Down Expand Up @@ -79,8 +79,8 @@ def loadDBAfterTruncatingTables(self, sqliteFile, incrementLevel = 'increment_pa

# increment level 'increment_patch_release', 'increment_minor_release',
# or 'specified' (new version, perhaps major release)
loader.loadDBAfterTruncatingTables(sqliteFile = '../RaMP_SQLite_v2.3.0_Structure.sqlite', incrementLevel = 'specified',
optionalVersionOveride = "2.3.0",
optionalVersionNote = "20230727 data update/refresh release",
loader.loadDBAfterTruncatingTables(sqliteFile = '/mnt/ncatsprod/braistedjc/tmp_work/RaMP_SQLite_BASE.sqlite', incrementLevel = 'specified',
optionalVersionOveride = "2.4.2",
optionalVersionNote = "20231107 data update, Rhea reaction to EC reaction class. Reactome Genes Patch.",
truncateTables=True)

Loading

0 comments on commit 6bed4f3

Please sign in to comment.