Skip to content

Commit

Permalink
add parts for SQLite DB loading
Browse files Browse the repository at this point in the history
  • Loading branch information
johnbraisted committed Aug 7, 2023
1 parent df46a09 commit 01c6583
Show file tree
Hide file tree
Showing 5 changed files with 1,330 additions and 2 deletions.
2 changes: 1 addition & 1 deletion config/db_load_resource_config.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ ready analytesource.txt source bulk None "sourceId,rampId,IDtype,geneOrCompound,
ready analytesynonym.txt analytesynonym bulk None "Synonym,rampId,geneOrCompound,source"
ready analytetopathway.txt analytehaspathway bulk None "rampId,pathwayRampId,pathwaySource"
ready analyte.txt analyte bulk rampId "rampId,type"
ready catalyzes.txt catalyzed bulk None "rampCompoundId,rampGeneId"
ready catalyzes.txt catalyzed bulk None "rampCompoundId,rampGeneId, proteinType"
empty reactomecatalyzed.sql catalyzed bulk None "rampCompoundId,rampGeneId"
empty wikicatalyzed.sql catalyzed bulk None "rampCompoundId,rampGeneId"
ready chemProps.txt chem_props bulk None "ramp_id,chem_data_source,chem_source_id,iso_smiles,inchi_key_prefix,inchi_key,inchi,mw,monoisotop_mass,common_name,mol_formula"
Expand Down
86 changes: 86 additions & 0 deletions main/mainSqliteDBLoad.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import sys
sys.path.append('../src')
from util.SQLiteDBBulkLoader import SQLiteDBBulkLoader



class mainSQLiteDBLoad(object):
    """Driver that runs the RaMP SQLite database load.

    Holds the paths of the configuration files used by the load and exposes
    one method that executes every load/update step in a fixed order through
    an SQLiteDBBulkLoader.
    """

    def __init__(self):
        """Set the configuration file paths (relative to the working directory)."""

        # db login credentials and host info
        # this is a private text file for login credentials
        # Format:
        #   host=<host_uri>
        #   dbname=<db_name_usually_ramp>
        #   username=<db_user_name_often_root>
        #   conpass=<db_connection_password>
        self.dbPropsFile = "../config/ramp_db_props.txt"

        # config for tables to load
        # a tab delimited file indicating which tables to load.
        self.dbConfigFilePath = "../config/db_load_resource_config.txt"

    def loadDBAfterTruncatingTables(self, sqliteFile,
                                    incrementLevel='increment_patch_release',
                                    optionalVersionOveride=None,
                                    optionalVersionNote=None,
                                    truncateTables=False,
                                    tablesToKeep=None):
        """Run the full database load against the given SQLite file.

        Despite the method name, tables are truncated only when
        ``truncateTables`` is True.

        Parameters:
            sqliteFile: path of the SQLite database file handed to the loader.
            incrementLevel: passed through to ``updateDBVersion``; the caller
                in this file documents 'increment_patch_release',
                'increment_minor_release' or 'specified'.
            optionalVersionOveride: passed to ``updateDBVersion`` as
                ``optionalVersion``.
            optionalVersionNote: passed to ``updateDBVersion`` as
                ``optionalNote``.
            truncateTables: when True, ``truncateTables`` is called on the
                loader before loading.
            tablesToKeep: table names passed as ``tablesToSkip`` when
                truncating. ``None`` (the default) means
                ``['db_version', 'version_info']``.
        """
        # Resolve the default here rather than in the signature so that a
        # single list object is never shared between calls.
        if tablesToKeep is None:
            tablesToKeep = ['db_version', 'version_info']

        # the loader receives the credentials/properties file path and the
        # target SQLite file
        loader = SQLiteDBBulkLoader(self.dbPropsFile, sqliteFile)

        # truncate tables
        if truncateTables:
            loader.truncateTables(tablesToSkip=tablesToKeep)

        # load the tables listed in the tab delimited resource config file.
        # Original author's note: rows marked 'ready' are loaded, and the
        # input files are the intermediate parsing results
        # (location given as ../../misc/sql/ -- TODO confirm).
        loader.load(self.dbConfigFilePath)

        # update Ontology Metabolite counts
        loader.updateOntologyMetaboliteCounts()

        # update Source pathwayCount
        loader.updateSourcePathwayCount()

        # sets the new updated version
        loader.updateDBVersion(incrementLevel=incrementLevel,
                               optionalVersion=optionalVersionOveride,
                               optionalNote=optionalVersionNote)

        # sets the entity intersects in the version table.
        # precondition (per the original author): updateDBVersion must have
        # run so the intersections attach to the current version
        loader.updateEntityIntersects()

        # this optional step tracks database version information supplied in
        # this file.
        loader.updateVersionInfo("../config/ramp_resource_version_update.txt")

        # populates a table that reflects the current status of the database
        # (original note: gene and metabolite counts for each data set).
        loader.updateDataStatusSummary()

        # generate pathway similarity matrices, analyte lists and whatnot
        # this process replaced the old system of having Rdata in the package
        loader.generateAndLoadRampSupplementalData()

# Build the load driver; it only records config file paths at this point.
loader = mainSQLiteDBLoad()

# Run the full load into the v2.3.0 structure file, truncating tables first.
# incrementLevel is one of 'increment_patch_release',
# 'increment_minor_release', or 'specified' (an explicitly supplied new
# version, perhaps a major release).
loader.loadDBAfterTruncatingTables(
    sqliteFile='../RaMP_SQLite_v2.3.0_Structure.sqlite',
    incrementLevel='specified',
    optionalVersionOveride="2.3.0",
    optionalVersionNote="20230727 data update/refresh release",
    truncateTables=True,
)

3 changes: 2 additions & 1 deletion src/util/EntityBuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -1258,7 +1258,8 @@ def writeReactionEntities(self):

for rxnId in self.reactionDict:
rxn = self.reactionDict[rxnId]
file.write(rxn.getMainRecordString())
if rxn.status > 0:
file.write(rxn.getMainRecordString())

file.close()

Expand Down
155 changes: 155 additions & 0 deletions src/util/RampSupplementalDataBuilder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
'''
Created on Aug 2, 2023
@author: braistedjc
'''
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
from sqlalchemy import MetaData
from sqlalchemy import create_engine
from sqlalchemy import text

class RampSupplementalDataBuilder(object):
    '''
    Builds supplemental RaMP data (pathway similarity matrices and
    per-pathway analyte counts) by querying a RaMP database.

    Only dbType 'sqlite' creates a database engine in the constructor. For
    any other dbType the engine stays None and the query methods raise
    RuntimeError.
    '''

    def __init__(self, dbType, credInfo):
        '''
        Constructor

        dbType: the type of DB, 'mysql' or 'sqlite'
        credInfo: a MySQL RaMP db_properties file, or an SQLite DB file
        '''
        # the type of DB, MySQL or SQLite
        self.dbType = dbType

        # a MySQL RaMP db_properties file, or an SQLite DB file
        self.credInfo = credInfo

        # sqlalchemy engine to provide connections to DB
        self.engine = None

        if self.dbType == 'sqlite':
            self.engine = self.createSQLiteEngine(self.credInfo)

        # Result holders; none of them is assigned by the methods below.
        # all analyte pathway similarity matrix
        self.analyteResult = None

        # presumably the metabolite-only pathway similarity matrix
        self.metsResult = None

        # presumably the gene-only pathway similarity matrix
        self.genesResult = None

    def createSQLiteEngine(self, sqliteFile=None):
        '''
        Create and return a sqlalchemy engine for the given SQLite file.

        sqliteFile: path of the SQLite file. It is concatenated onto the
        'sqlite:///' URL prefix, so despite the None default a string must
        be supplied.
        '''
        engine = create_engine('sqlite:///'+sqliteFile, echo=False)
        return engine

    def _connect(self):
        '''
        Return a new connection from the engine, raising RuntimeError with a
        readable message when no engine was created for this dbType.
        '''
        if self.engine is None:
            raise RuntimeError(
                "No database engine is available for DB type: " + str(self.dbType))
        return self.engine.connect()

    def listTables(self):
        '''
        Print the shape and contents of the list of tables in the database.

        Prints a message and returns None for a dbType other than 'mysql'
        or 'sqlite'. Raises RuntimeError when no engine exists.
        '''
        if self.dbType == 'mysql':
            sql = 'show tables'
        elif self.dbType == 'sqlite':
            sql = "SELECT name FROM sqlite_master WHERE type ='table' AND name NOT LIKE 'sqlite_%'"
        else:
            print("Unsupported DB Type: " + self.dbType)
            return

        # text() is required: SQLAlchemy 2.x no longer executes plain strings.
        # The with-block closes the connection, so no explicit close is needed.
        with self._connect() as conn:
            tables = conn.execute(text(sql)).all()

        tables = pd.DataFrame(tables)
        print("tables shape:" + str(tables.shape))
        print(tables)

    def buildPathwaySimilarityMatrices(self):
        '''Placeholder; not implemented yet. Returns None.'''
        pass

    def buildAnalyteSetStats(self):
        '''Placeholder; not implemented yet. Returns None.'''
        pass

    def buildSimilarityMatrix(self, matrixType):
        '''
        Build a pathway-by-pathway Jaccard similarity matrix.

        matrixType: 'mets' restricts analytes to ids like 'RAMP_C%' and
        'genes' to ids like 'RAMP_G%', both with a minimum pathway size of 5.
        Any other value uses all ids like 'RAMP_%' with a minimum pathway
        size of 10.

        Pathways of type 'hmdb' are excluded by the query. Returns a square
        DataFrame indexed and labelled by pathwayRampId holding
        1 - jaccard distance, or an empty DataFrame when the query returns
        no rows or no pathway reaches the minimum size. Raises RuntimeError
        when no engine exists.
        '''
        analyteKey = 'RAMP_%'
        minPathwaySize = 10

        if matrixType == 'mets':
            analyteKey = 'RAMP_C%'
            minPathwaySize = 5
        elif matrixType == 'genes':
            analyteKey = 'RAMP_G%'
            minPathwaySize = 5

        # the id pattern is supplied as a bound parameter, not spliced in
        sql = "select ap.pathwayRampId, ap.rampID from analytehaspathway ap, pathway p "\
              "where p.type != 'hmdb' and ap.pathwayRampId = p.pathwayRampId and ap.rampId like :analyteKey"

        with self._connect() as conn:
            rows = conn.execute(text(sql), {"analyteKey": analyteKey}).all()

        # naming the columns at construction also works for an empty result
        df = pd.DataFrame(rows, columns=['pathwayRampId', 'rampId'])
        print(df.shape)
        print(list(df.columns))

        if df.empty:
            return pd.DataFrame()

        # rows: analytes, columns: pathways, values: occurrence counts
        crossTab = pd.crosstab(df['rampId'], df['pathwayRampId'])
        ctSums = crossTab.sum(axis=0)
        pwSubset = ctSums[ctSums >= minPathwaySize]

        pwNames = pwSubset.index.values.tolist()
        if not pwNames:
            # pairwise_distances rejects an input with zero samples
            return pd.DataFrame()
        crossTab = crossTab.loc[:, pwNames]

        # jaccard is a boolean metric; converting explicitly gives the same
        # result as sklearn's implicit conversion without its warning
        membership = crossTab.T.to_numpy().astype(bool)
        dm = 1.0 - pairwise_distances(membership, metric='jaccard')

        dm = pd.DataFrame(dm, index=crossTab.columns, columns=crossTab.columns)

        return dm

    def buildAnalyteSet(self, dataSource, geneOrMet):
        '''
        Count distinct analytes per pathway for one pathway source.

        dataSource: value matched against pathway.type.
        geneOrMet: 'genes' counts ids like 'RAMP_G%'; any other value counts
        ids like 'RAMP_C%'.

        Returns a DataFrame built from the query rows (pathwayRampId, Freq,
        pathwaySource). Raises RuntimeError when no engine exists.
        '''
        print("building analyte stat set")

        rampIdPrefix = "RAMP_C%"
        if geneOrMet == 'genes':
            rampIdPrefix = "RAMP_G%"

        # caller-supplied values are bound parameters, not spliced into SQL
        sql = "select ap.pathwayRampId, count(distinct(ap.rampId)) as Freq, p.type as pathwaySource "\
              "from analytehaspathway ap, pathway p "\
              "where p.type = :dataSource and ap.pathwayRampId = p.pathwayRampId "\
              "and ap.rampId like :rampIdPrefix group by ap.pathwayRampId"

        with self._connect() as conn:
            rows = conn.execute(
                text(sql),
                {"dataSource": dataSource, "rampIdPrefix": rampIdPrefix}).all()

        df = pd.DataFrame(rows)

        print("Stats shape")
        print(df.shape)
        print("Stats header")
        print(df.columns)

        return df


#pwob = PathwayOverlapBuilder(dbType = "sqlite", credInfo = "X:\\braistedjc\\tmp_work\\RaMP_SQLite_v2.3.0_Structure.sqlite")
#pwob.listTables()
#pwob.buildBaseMatrix(matrixType = "analytes")
# pwob.buildSimilarityMatrix(matrixType = "genes")

#pwob.buildAnalyteSet("wiki", "met")
#pwob.buildAnalyteSet("wiki", "gene")

#pwob.buildAnalyteSet("reactome", "met")
#pwob.buildAnalyteSet("reactome", "gene")

#pwob.buildAnalyteSet("hmdb", "met")
# pwob.buildAnalyteSet("hmdb", "gene")
#pwob.buildBaseMatrix(matrixType = "genes")
Loading

0 comments on commit 01c6583

Please sign in to comment.