add parts for SQLite DB loading

ncats · Aug 7, 2023 · 01c6583 · 01c6583
1 parent df46a09
commit 01c6583
Show file tree

Hide file tree

Showing 5 changed files with 1,330 additions and 2 deletions.
diff --git a/config/db_load_resource_config.txt b/config/db_load_resource_config.txt
@@ -4,7 +4,7 @@ ready	analytesource.txt	source	bulk	None	"sourceId,rampId,IDtype,geneOrCompound,
 ready	analytesynonym.txt	analytesynonym	bulk	None	"Synonym,rampId,geneOrCompound,source"
 ready	analytetopathway.txt	analytehaspathway	bulk	None	"rampId,pathwayRampId,pathwaySource"
 ready	analyte.txt	analyte	bulk	rampId	"rampId,type"
-ready	catalyzes.txt	catalyzed	bulk	None	"rampCompoundId,rampGeneId"
+ready	catalyzes.txt	catalyzed	bulk	None	"rampCompoundId,rampGeneId, proteinType"
 empty	reactomecatalyzed.sql	catalyzed	bulk	None	"rampCompoundId,rampGeneId"
 empty	wikicatalyzed.sql	catalyzed	bulk	None	"rampCompoundId,rampGeneId"
 ready	chemProps.txt	chem_props	bulk	None	"ramp_id,chem_data_source,chem_source_id,iso_smiles,inchi_key_prefix,inchi_key,inchi,mw,monoisotop_mass,common_name,mol_formula"

diff --git a/main/mainSqliteDBLoad.py b/main/mainSqliteDBLoad.py
@@ -0,0 +1,86 @@
+import sys
+sys.path.append('../src')
+from util.SQLiteDBBulkLoader import SQLiteDBBulkLoader
+
+
+
+class mainSQLiteDBLoad(object):
+    """Driver that runs the full sequence of RaMP SQLite database load steps.
+
+    All work is delegated to SQLiteDBBulkLoader; this class only holds the
+    configuration file locations and the order in which the steps are run.
+    """
+
+    def __init__(self):
+        """Set the locations of the configuration files used by the load."""
+
+        # db login credentials and host info
+        # this is a private text file for login credentials
+        # Format:
+        #     host=<host_uri>
+        #     dbname=<db_name_usually_ramp>
+        #     username=<db_user_name_often_root>
+        #     conpass=<db_connection_password>
+        self.dbPropsFile = "../config/ramp_db_props.txt"
+
+        # config for tables to load
+        # a tab delimited file indicating which tables to load.
+        self.dbConfigFilePath = "../config/db_load_resource_config.txt"
+
+        # file holding the resource version information that is passed
+        # to the loader's updateVersionInfo step.
+        self.versionInfoFilePath = "../config/ramp_resource_version_update.txt"
+
+
+    def loadDBAfterTruncatingTables(self, sqliteFile, incrementLevel = 'increment_patch_release', optionalVersionOveride = None, optionalVersionNote = None, truncateTables = False, tablesToKeep = None):
+        """Load the SQLite database, optionally truncating its tables first.
+
+        sqliteFile -- path of the SQLite database file handed to the loader.
+        incrementLevel -- 'increment_patch_release', 'increment_minor_release',
+            or 'specified' (new version, perhaps major release).
+        optionalVersionOveride -- version string forwarded to updateDBVersion
+            as optionalVersion.
+        optionalVersionNote -- note forwarded to updateDBVersion as optionalNote.
+        truncateTables -- when True, tables are truncated before loading.
+            NOTE: despite the method name, truncation is off by default.
+        tablesToKeep -- tables to skip when truncating. None (the default)
+            means ['db_version', 'version_info'].
+        """
+
+        # Build the default list per call rather than sharing one list object
+        # between calls through the function signature.
+        if tablesToKeep is None:
+            tablesToKeep = ['db_version', 'version_info']
+
+        ################# DB Loading Instructions
+
+        # pass the credentials file and the SQLite file to the constructed SQLiteDBBulkLoader
+        loader = SQLiteDBBulkLoader(self.dbPropsFile, sqliteFile)
+
+        # truncate tables
+        if truncateTables:
+            loader.truncateTables(tablesToSkip=tablesToKeep)
+
+        # update methods
+        # the db_load_resource_config.txt is a tab delimited file indicating which resources to load
+        # those marked as 'ready' will be updated. Usually all database tables are updated in one run.
+        # this method loads the intermediate parsing results from the ../../misc/sql/ directory.
+        loader.load(self.dbConfigFilePath)
+
+        # update Ontology Metabolite counts
+        loader.updateOntologyMetaboliteCounts()
+
+        # update Source pathwayCount
+        loader.updateSourcePathwayCount()
+
+        # sets the new updated version
+        loader.updateDBVersion(incrementLevel = incrementLevel, optionalVersion = optionalVersionOveride, optionalNote = optionalVersionNote)
+
+        # sets the analyte intercept json in the version table.
+        # precondition: the updateDBVersion must have been set so that the
+        # intersections can be attached to the current version
+        loader.updateEntityIntersects()
+
+        # this optional method tracks database version information supplied in this file.
+        loader.updateVersionInfo(self.versionInfoFilePath)
+
+        # this method populates a table that reflects the current status of the database.
+        # metrics such as gene and metabolite counts for each data set are tallied.
+        loader.updateDataStatusSummary()
+
+        # generate pathway similarity matrices, analyte lists and whatnot
+        # this process replaced the old system of having Rdata in the package
+        loader.generateAndLoadRampSupplementalData()
+
+loader = mainSQLiteDBLoad()
+
+# Run the load against the v2.3.0 structure file.
+# incrementLevel is one of 'increment_patch_release', 'increment_minor_release',
+# or 'specified' (new version, perhaps major release); 'specified' is used here
+# together with an explicit version string.
+loader.loadDBAfterTruncatingTables(
+    sqliteFile = '../RaMP_SQLite_v2.3.0_Structure.sqlite',
+    incrementLevel = 'specified',
+    optionalVersionOveride = "2.3.0",
+    optionalVersionNote = "20230727 data update/refresh release",
+    truncateTables = True,
+)
+
diff --git a/src/util/EntityBuilder.py b/src/util/EntityBuilder.py
@@ -1258,7 +1258,8 @@ def writeReactionEntities(self):
 
         for rxnId in self.reactionDict:
             rxn = self.reactionDict[rxnId]
-            file.write(rxn.getMainRecordString())
+            if rxn.status > 0:
+                file.write(rxn.getMainRecordString())
 
         file.close()
 

diff --git a/src/util/RampSupplementalDataBuilder.py b/src/util/RampSupplementalDataBuilder.py
@@ -0,0 +1,155 @@
+'''
+Created on Aug 2, 2023
+
+@author: braistedjc
+'''
+import pandas as pd
+from sqlalchemy import create_engine
+from sqlalchemy import MetaData
+from sklearn.metrics.pairwise import pairwise_distances
+
+class RampSupplementalDataBuilder(object):
+    '''
+    Queries a RaMP database and builds supplemental data sets from it:
+    pathway similarity matrices and per-pathway analyte counts.
+    '''
+
+
+    def __init__(self, dbType, credInfo):
+        '''
+        Constructor
+
+        dbType -- the type of DB, 'mysql' or 'sqlite'. An engine is only
+                  created here for 'sqlite'.
+        credInfo -- a MySQL RaMP db_properties file, or an SQLite DB file.
+        '''
+        # the type of DB, MySQL or SQLite
+        self.dbType = dbType
+
+        # a MySQL RaMP db_properties file, or an SQLite DB file
+        self.credInfo = credInfo
+
+        # sqlalchemy engine to provide connections to DB
+        # stays None unless dbType is 'sqlite'
+        self.engine = None
+
+        if self.dbType == 'sqlite':
+            self.engine = self.createSQLiteEngine(self.credInfo)
+
+        # all analyte pathway similarity matrix
+        self.analyteResult = None
+
+        # metabolite pathway similarity matrix (presumably; not set in this class)
+        self.metsResult = None
+
+        # gene pathway similarity matrix (presumably; not set in this class)
+        self.genesResult = None
+
+
+
+    def createSQLiteEngine(self, sqliteFile=None):
+        '''
+        Return a sqlalchemy engine for the given SQLite database file.
+        '''
+        engine = create_engine('sqlite:///'+sqliteFile, echo=False)
+        return engine
+
+    def _fetchAll(self, sql, params=None):
+        '''
+        Run one SQL statement and return all result rows.
+
+        sql -- statement text; values are referenced as :name bind parameters.
+        params -- optional dict of bind parameter values.
+
+        Raises RuntimeError when no engine has been created for this dbType.
+        '''
+        # local import so this class does not depend on a change to the
+        # module import block
+        from sqlalchemy import text
+
+        if self.engine is None:
+            raise RuntimeError("No database engine available for DB type: " + str(self.dbType))
+
+        # text() is required by SQLAlchemy 2.x, which no longer accepts a
+        # plain string in Connection.execute().
+        statement = text(sql)
+
+        # the context manager closes the connection on exit
+        with self.engine.connect() as conn:
+            if params:
+                return conn.execute(statement, params).all()
+            return conn.execute(statement).all()
+
+    def listTables(self):
+        '''
+        Print the shape and content of the list of tables in the database.
+        Prints a message and returns if the DB type is not supported.
+        '''
+        if self.dbType == 'mysql':
+            sql = 'show tables'
+        elif self.dbType == 'sqlite':
+            sql = "SELECT name FROM sqlite_master WHERE type ='table' AND name NOT LIKE 'sqlite_%'"
+        else:
+            print("Unsupported DB Type: " + str(self.dbType))
+            return
+
+        tables = pd.DataFrame(self._fetchAll(sql))
+        print("tables shape:" + str(tables.shape))
+        print(tables)
+
+    def buildPathwaySimilarityMatrices(self):
+        '''
+        Placeholder, not implemented yet.
+        '''
+        pass
+
+    def buildAnalyteSetStats(self):
+        '''
+        Placeholder, not implemented yet.
+        '''
+        pass
+
+    def buildSimilarityMatrix(self, matrixType):
+        '''
+        Build a pathway-by-pathway Jaccard similarity matrix.
+
+        matrixType -- 'mets' restricts to RAMP_C ids, 'genes' to RAMP_G ids
+                      (minimum pathway size 5 for both); any other value
+                      uses all RAMP_ ids with a minimum pathway size of 10.
+
+        Pathways of type 'hmdb' are excluded by the query.
+
+        Returns a DataFrame indexed and labelled by pathwayRampId. An empty
+        DataFrame is returned when the query yields no rows or when no
+        pathway reaches the minimum size.
+        '''
+        analyteKey = 'RAMP_%'
+        minPathwaySize = 10
+
+        if matrixType == 'mets':
+            analyteKey = 'RAMP_C%'
+            minPathwaySize = 5
+        elif matrixType == 'genes':
+            analyteKey = 'RAMP_G%'
+            minPathwaySize = 5
+
+        sql = "select ap.pathwayRampId, ap.rampId from analytehaspathway ap, pathway p "\
+        "where p.type != 'hmdb' and ap.pathwayRampId = p.pathwayRampId and ap.rampId like :analyteKey"
+
+        rows = self._fetchAll(sql, {'analyteKey': analyteKey})
+
+        # naming the columns at construction also works for an empty result
+        df = pd.DataFrame([tuple(row) for row in rows], columns=['pathwayRampId', 'rampId'])
+        print(df.shape)
+        print(list(df.columns))
+
+        if df.empty:
+            return pd.DataFrame()
+
+        # analyte x pathway membership counts
+        crossTab = pd.crosstab(df['rampId'], df['pathwayRampId'])
+
+        # keep only pathways with at least minPathwaySize analytes
+        ctSums = crossTab.sum(axis=0)
+        pwSubset = ctSums[ctSums >= minPathwaySize]
+        pwNames = pwSubset.index.values.tolist()
+        crossTab = crossTab.loc[:,pwNames]
+
+        if crossTab.shape[1] == 0:
+            return pd.DataFrame()
+
+        # similarity = 1 - jaccard distance between pathway membership vectors
+        dm = 1.0 - pairwise_distances(crossTab.T.to_numpy(), metric='jaccard')
+
+        dm = pd.DataFrame(dm)
+        dm.columns = crossTab.columns
+        dm.index = crossTab.columns
+
+        return dm
+
+
+
+    def buildAnalyteSet(self, dataSource, geneOrMet):
+        '''
+        Count the distinct analytes per pathway for one pathway source.
+
+        dataSource -- value matched against pathway.type.
+        geneOrMet -- 'genes' selects RAMP_G ids; any other value selects
+                     RAMP_C ids.
+
+        Returns a DataFrame of the query result; the query selects
+        pathwayRampId, Freq and pathwaySource.
+        '''
+
+        print("building analyte stat set")
+
+        rampIdPrefix = "RAMP_C%"
+        if geneOrMet == 'genes':
+            rampIdPrefix = "RAMP_G%"
+
+        # values are bound rather than concatenated into the statement
+        sql = "select ap.pathwayRampId, count(distinct(ap.rampId)) as Freq, p.type as pathwaySource "\
+        "from analytehaspathway ap, pathway p "\
+        "where p.type = :dataSource and ap.pathwayRampId = p.pathwayRampId and ap.rampId like :rampIdPrefix group by ap.pathwayRampId"
+
+        rows = self._fetchAll(sql, {'dataSource': dataSource, 'rampIdPrefix': rampIdPrefix})
+        df = pd.DataFrame(rows)
+
+        print("Stats shape")
+        print(df.shape)
+        print("Stats header")
+        print(df.columns)
+
+        return df
+
+
+#pwob = RampSupplementalDataBuilder(dbType = "sqlite", credInfo = "X:\\braistedjc\\tmp_work\\RaMP_SQLite_v2.3.0_Structure.sqlite")
+#pwob.listTables()
+#pwob.buildSimilarityMatrix(matrixType = "analytes")
+# pwob.buildSimilarityMatrix(matrixType = "genes")
+
+#pwob.buildAnalyteSet("wiki", "mets")
+#pwob.buildAnalyteSet("wiki", "genes")
+
+#pwob.buildAnalyteSet("reactome", "mets")
+#pwob.buildAnalyteSet("reactome", "genes")
+
+#pwob.buildAnalyteSet("hmdb", "mets")
+# pwob.buildAnalyteSet("hmdb", "genes")
+#pwob.buildSimilarityMatrix(matrixType = "genes")