From e144b4cda01be34e5e992abe0c717992aa9d0556 Mon Sep 17 00:00:00 2001 From: johnbraisted Date: Mon, 11 Dec 2023 11:13:53 -0500 Subject: [PATCH 1/2] update to reactomeData.py (entrez genes patch), sqlite update geneCount patch --- src/parse/reactomeData.py | 35 +++++++++++++++++++++++++--------- src/util/SQLiteDBBulkLoader.py | 18 ++++++++++++----- 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/src/parse/reactomeData.py b/src/parse/reactomeData.py index 219b25e..0a7d053 100755 --- a/src/parse/reactomeData.py +++ b/src/parse/reactomeData.py @@ -389,11 +389,16 @@ def getCommonNameFromUniprot(self): files = os.listdir("../misc/data/Uniprot/") path = "../misc/data/Uniprot/" i = 0 + haveMapping = False + haveGeneSymbol = False #print('Parsing UniProt files ...') for f in files: i = i + 1 #if i % 1000 == 0: #print('Processing {} files'.format(i)) + + haveMapping = False + haveGeneSymbol = False try: tree = ET.parse(path + f) geneid = f.replace(".xml","") @@ -410,31 +415,43 @@ def getCommonNameFromUniprot(self): #print(geneid+":"+name.text) try: mapping = self.geneInfoDictionary['uniprot:'+geneid] + haveMapping = True mapping["common_name"] = "gene_symbol:"+name.text + haveGeneSymbol = True except KeyError: pass + print("Key Error for "+geneid+" in file "+f) + # print("Raw data does not have this ID ...") # print(geneid) # we now have uniprot to 'common_name', really gene id. # now we want to grab the NCBI/Entrez 'GeneID' -# if childtag == "dbReference": -# if child2.get("type") == "GeneID": -# geneId = child2.get("id") -# geneId = 'entrez:'+geneId + if childtag == "dbReference": + if child2.get("type") == "GeneID": + + if not haveMapping: + print("Hey we are adding a gene id but don't have new mapping. Uniprot:"+f) + # we don't have the mapping for the protein based from above... + # jump to next child... eventually next file. + continue + + geneId = child2.get("id") + geneId = 'entrez:'+geneId # protein to gene can be 1:n, so they have to be stored as a list # lets check for a value -# idList = mapping.get("small_e_entrez", None) -# if(idList == None): -# idList = list() -# mapping["small_e_entrez"] = idList + idList = mapping.get("small_e_entrez", None) + if(idList == None): + idList = list() + mapping["small_e_entrez"] = idList -# idList.append(geneId) + idList.append(geneId) except ET.ParseError: print("Skip {} ...".format(f)) + pass # def checkFiles(self): diff --git a/src/util/SQLiteDBBulkLoader.py b/src/util/SQLiteDBBulkLoader.py index 84c8ce9..ae3333e 100644 --- a/src/util/SQLiteDBBulkLoader.py +++ b/src/util/SQLiteDBBulkLoader.py @@ -156,7 +156,6 @@ def loadFile(self, resource, engine): df = df.drop_duplicates(ignore_index=False, inplace=False, keep='first') print(str(df.shape)) - print(df.head(n=5)) table = resource.destTable # this loads the data frame into the table. try: @@ -834,7 +833,7 @@ def updateSourcePathwayCount(self): # conn.execute(sql) # conn.close() - sql = "select ap.rampId, count(distinct(ap.pathwayRampId)) as pathwayCount from analytehaspathway ap "\ + sql = "select count(distinct(ap.pathwayRampId)) as pathwayCount, ap.rampId from analytehaspathway ap "\ "where ap.pathwaySource != 'hmdb' group by ap.rampId" sql2 = "update source set pathwayCount = :pathwayCount where rampId = :rampId" @@ -842,11 +841,20 @@ def updateSourcePathwayCount(self): with self.engine.connect() as conn: df = conn.execute(sql).all() df = pd.DataFrame(df) - df.columns = ["rampId", "pathwayCount"] - + df.columns = ["pathwayCount", "rampId"] + + print("setting pw count... shape=") + print(df.shape) + print(df.head(10)) + + k = 0 for i,row in df.iterrows(): + k = k + 1 + if k < 10: + print(row) + print("\n") conn.execute(sql2, row) - + conn.close() print("Finished: updating pathway counts in source table") From 17ba30fe7258e4192dc70f67c9b0113d4323ee56 Mon Sep 17 00:00:00 2001 From: johnbraisted Date: Fri, 16 Feb 2024 11:59:44 -0500 Subject: [PATCH 2/2] remove old code thats not used. Remediation of vulnerabilites --- src/update/RaMPDatabase.py | 68 ------------------ src/update/RaMPFixer.py | 27 ------- .../__pycache__/RaMPDatabase.cpython-35.pyc | Bin 2293 -> 0 bytes .../__pycache__/RaMPFixer.cpython-35.pyc | Bin 2464 -> 0 bytes 4 files changed, 95 deletions(-) delete mode 100644 src/update/RaMPDatabase.py delete mode 100644 src/update/RaMPFixer.py delete mode 100644 src/update/__pycache__/RaMPDatabase.cpython-35.pyc delete mode 100644 src/update/__pycache__/RaMPFixer.cpython-35.pyc diff --git a/src/update/RaMPDatabase.py b/src/update/RaMPDatabase.py deleted file mode 100644 index 2e43159..0000000 --- a/src/update/RaMPDatabase.py +++ /dev/null @@ -1,68 +0,0 @@ -import os -import pymysql.cursors -from fileinput import close -import pandas as pd - - -class RaMPDatabase(): - ''' - This class is the super class of all checker, updater class - It contains general functions to aid genral functionality of other classes - - attribute str dbname database name for the mysql - - ''' - def __init__(self): - - self.table_names = [ - "analyte", - "analytehasontology", - "analytehaspathway", - "analytesynonym", - "catalyzed", - "ontology", - "pathway", - "source"] - - - - def check_path(self,dir): - ''' - This fucntion check if this directory exists, otherwise it will create one - - param dict dir: The directory to check or created. - - return: True if the path has been created successfully - ''' - if not os.path.exists(dir): - try: - os.makedirs(dir) # check if the directory exists, create one if not - return True - except OSError as e: # Trap the OS error and show the embedded error code - if e.errno != errno.EEXIST: - raise - - - def connectToRaMP(self, host= "localhost", user = "root" - ,password = "Ehe131224",dbname = "mathelabramp"): - ''' - Connect to local RaMP database by MySQL - - param str host host name for the mysql connection - - param str user username for the mysql conncection - - param str dbname database name for connection if None: connect to the database page - instead of table page - - param str password the password you used for you computer's mysql database - ''' - if dbname is not None: - conn = pymysql.connect(host = host, - user= user, - password = password, - db = dbname, - charset = "utf8mb4", - cursorclass = pymysql.cursors.DictCursor) - else: - conn = pymysql.connect(host = host, - user= user, - password = password, - charset = "utf8mb4", - cursorclass = pymysql.cursors.DictCursor) - return conn - diff --git a/src/update/RaMPFixer.py b/src/update/RaMPFixer.py deleted file mode 100644 index 2747155..0000000 --- a/src/update/RaMPFixer.py +++ /dev/null @@ -1,27 +0,0 @@ -from update.RaMPDatabase import RaMPDatabase -import pymysql.err -from builtins import str -import pandas as pd -import numpy as np -from schema import session,Source,Analyte,Pathway,Analytehasontology,\ -Analytehaspathway,Analytesynonym,Ontology,Catalyzed - -class RaMPFixer(RaMPDatabase): - ''' - This class simulates the function of the C# code in RaMP - Get the original RaMP data from MySQL, then do the following things: - 1) For each table, find the missing data cell, and remove the entire row since the map - is not successful. (Some columns are deliberated to left empty with 'NA'.) - 2) Remove special character such as ' ', and wrong data in the cell. Consider drop entire - row based on conditions - 3) Remapping corrected RaMP ID relations based on corrected data. - 4) Create and import new RaMP data to the database. - ''' - - def __init__(self): - super().__init__() - - self.new_tables = dict() - - - diff --git a/src/update/__pycache__/RaMPDatabase.cpython-35.pyc b/src/update/__pycache__/RaMPDatabase.cpython-35.pyc deleted file mode 100644 index 24e39f980f2f0af9209081d4d7d1bd6bb6318754..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2293 zcmZ`*-EQ1O6h5|hy+7SFB~n{aaWO(dXhn(Wk5GvaRHOk_qJfg2w$@74tUa4`;`O*Q zV+BVk7o_jNGw?v%%q>^E0#}?f_HMS&+VRQEc+St9^PSIoV`F3Z;{D&F?IFNluyzGF zeu$6$8bykK9^?R#=f*zdJ}d%=0`S^*9f-Q{T^c~zf!Dab0~bC-A?Y2AyMLn**dKeg zo8(f9gr>_Y#KA*+^tUL)(nsWzDmqAU5Rl2JLsrwS8+Rc^+dWA8kPaXnLV5*WV+^=B zLWdi6^b>#b<0EePlxuPJ=aAubIL|aoa;`PQftd@Yt5T?TNzNG0bC%3Saw61iR+T9? ztGCX;W5W_z7@ifH9f?9Hp0in1Bqo!%VkF~PN-x&ed7hb?p>c^0TVFyub53w$R5q;) z24fUUrv+aKhMryE%*|#}xo8))ewjNZ=49ew#^|E|eSCKD(I26})HQi)GPcUqQ6J6$ zVpq|F@nApd@q*{I5z$sVm~)LMl({^rqxUY?N^a(-ypFE6OS&#(Q7@uFf|2I+nMkAl zO0U=MwxO1lO2pWUuHboc5z}!r!l0u?KKla{)&8Gvc6TN}YoYYy!JRKYzc+a#lWHLf zqbH|Qo#>J$VnWrNXq8Nws<`>Gy4z)K`*EBVnTg}8l!9w0APBtm`|SEUBDR=vB+aNGsc zh2s!T0#HxkH-Kj^s3tz(o?m@V3K24sx0Bd@i-I9VMlS277aH=iZ&Wckf4|1zHq2pJK+!gd(7e zV7h^i!$ysRb0i`UP6nX%TAW}d5huu0SbA)_b0n^=+qR@Bf)k=5fC~s;b)op|-HLBf z>7vSc7r`>%ud&-#>D`q@fuJDnai?N*=u7&T*6c}r@bvrD{v{ZQYbXPa_&>`K&yit7 z;ahbp>{mwO9jZyA+wc@|@4Dl6lH8?JLfSr}cxM?1osginV#bmmU1A&w*$AEz%{rn% zgs#0^_gy0UBdw$@j@wBsD>_Zeh-zt)$VG{qtUlJvvF@ecRXX-4ojRA&Us92ODR7wRzNn0wY3r!Hhidc4H#qID zxPHYjY;YnV1kd308% zQJ3B9^;>@<2KBwN_f$UqVOQpIEUg-1p>A^%uyA=WEI#aATof@HfEm>oAHe)UG{Fj9 z>-0O;5z<8sTNJL)YXEMt&R)M=b~x69=%q69;~G&{;Mku=Rs^xMR@f*hVwo!|@~IM& zA`S9Lr$SHM`_2a<(4i7hD%g!PeyDOM(XWSglVLe9Zd8X4ruaEz&tR&0r_3gfZSOW-IQSC#P( z#EwpF6v84jIujLq=bcmFMhFE8ALulUcnhqP-FA^>mT@rxZHz+DuqyeZUDN}6$I6!- zES0!+1Lx(lzSuDeiMk?2Nv2IMQgwW3>reoO$~KIpvI{)dghYHt^H3e9-NYp)IwLwl zdqbGl_;y0%yD{18d&7RCY}z7IrfmCult#JlmyIwA@=+IYeYf2fjo+{;p5Wf`ebiW%&DYPTV6en9EPIJW&aoUDnPdUy#s?eoxr5VQMdl}%G>@Kj$XulpH-UJz zH@F6vz%!42^Cvo@J;-HMY|D|X-M*BAO2*Q5Hv7T-bs@uK5SFip|DGb4Z3!OCjjZv(JDnw>%U6(cxiR1 z8}=?Kt`ZB6dc3ELuxoD$yVt)WU!pv&!_HadoI$ElwL%$#A&+vE*oq(2OT1v>c;c%z ztBh5pYC$$ZT!gA336}8zvbgVNlZOedDs!U2(cM%Tb?yLaG7;eD3N=6RJid@|0nS7?9vzpENM-US zyC(~>499OaUd~;WSx{hxD_pq-Q?Yv^kKJ^hgBs2^f6>Jfd1mwru*)SB^UZxOu|u@f zLT+*aVEzt94{Rbns6c|I95ep;wIL{pAx)6C5>wEdjpjvfCF`W^Jc>%QVnGP>7Uc&^)WM=u&>$Ym^ZMW86E#Iw} zbHK267qoAf