From e144b4cda01be34e5e992abe0c717992aa9d0556 Mon Sep 17 00:00:00 2001
From: johnbraisted <jb212828@gmail.com>
Date: Mon, 11 Dec 2023 11:13:53 -0500
Subject: [PATCH 1/2] update to reactomeData.py (entrez genes patch), sqlite
 update geneCount patch

---
 src/parse/reactomeData.py      | 35 +++++++++++++++++++++++++---------
 src/util/SQLiteDBBulkLoader.py | 18 ++++++++++++-----
 2 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/src/parse/reactomeData.py b/src/parse/reactomeData.py
index 219b25e..0a7d053 100755
--- a/src/parse/reactomeData.py
+++ b/src/parse/reactomeData.py
@@ -389,11 +389,16 @@ def getCommonNameFromUniprot(self):
         files = os.listdir("../misc/data/Uniprot/")
         path = "../misc/data/Uniprot/"   
         i = 0
+        haveMapping = False
+        haveGeneSymbol = False
         #print('Parsing UniProt files ...')
         for f in files:
             i = i + 1
             #if i % 1000 == 0:
                 #print('Processing {} files'.format(i))
+
+            haveMapping = False
+            haveGeneSymbol = False
             try:
                 tree = ET.parse(path + f)
                 geneid = f.replace(".xml","")
@@ -410,31 +415,43 @@ def getCommonNameFromUniprot(self):
                                     #print(geneid+":"+name.text)
                                     try:
                                         mapping = self.geneInfoDictionary['uniprot:'+geneid]
+                                        haveMapping = True
                                         mapping["common_name"] = "gene_symbol:"+name.text
+                                        haveGeneSymbol = True
                                     except KeyError:
                                         pass
+                                        print("Key Error for "+geneid+" in file "+f)
+	                                
                                         # print("Raw data does not have this ID ...")
                                         # print(geneid)
                                         
                         # we now have uniprot to 'common_name', really gene id.
                         # now we want to grab the NCBI/Entrez 'GeneID'                 
-#                        if childtag == "dbReference":
-#                            if child2.get("type") == "GeneID":
-#                                geneId = child2.get("id")
-#                                geneId = 'entrez:'+geneId
+                        if childtag == "dbReference":
+                            if child2.get("type") == "GeneID":
+                                
+                                if not haveMapping:
+                                    print("Hey we are adding a gene id but don't have new mapping. Uniprot:"+f)
+                                    # we don't have the mapping for the protein based from above... 
+                                    # jump to next child... eventually next file.
+                                    continue
+
+                                geneId = child2.get("id")
+                                geneId = 'entrez:'+geneId
                                 # protein to gene can be 1:n, so they have to be stored as a list
                                 # lets check for a value
-#                                idList = mapping.get("small_e_entrez", None)
-#                                if(idList == None):
-#                                    idList = list()
-#                                    mapping["small_e_entrez"] = idList
+                                idList = mapping.get("small_e_entrez", None)
+                                if(idList == None):
+                                    idList = list()
+                                    mapping["small_e_entrez"] = idList
 
-#                                idList.append(geneId)  
+                                idList.append(geneId)  
                                 
                         
                                         
             except ET.ParseError:
                 print("Skip {} ...".format(f))
+                
                 pass
              
 #     def checkFiles(self):
diff --git a/src/util/SQLiteDBBulkLoader.py b/src/util/SQLiteDBBulkLoader.py
index 84c8ce9..ae3333e 100644
--- a/src/util/SQLiteDBBulkLoader.py
+++ b/src/util/SQLiteDBBulkLoader.py
@@ -156,7 +156,6 @@ def loadFile(self, resource, engine):
             df = df.drop_duplicates(ignore_index=False, inplace=False, keep='first')
             print(str(df.shape))
 
-        print(df.head(n=5))
         table = resource.destTable
         # this loads the data frame into the table.
         try:
@@ -834,7 +833,7 @@ def updateSourcePathwayCount(self):
         #    conn.execute(sql)
         #    conn.close()
     
-        sql = "select ap.rampId, count(distinct(ap.pathwayRampId)) as pathwayCount from analytehaspathway ap "\
+        sql = "select count(distinct(ap.pathwayRampId)) as pathwayCount, ap.rampId from analytehaspathway ap "\
         "where ap.pathwaySource != 'hmdb' group by ap.rampId"
         
         sql2 = "update source set pathwayCount = :pathwayCount where rampId = :rampId"
@@ -842,11 +841,20 @@ def updateSourcePathwayCount(self):
         with self.engine.connect() as conn:
             df = conn.execute(sql).all()
             df = pd.DataFrame(df)
-            df.columns = ["rampId", "pathwayCount"]
-            
+            df.columns = ["pathwayCount", "rampId"]
+
+            print("setting pw count... shape=")
+            print(df.shape)
+            print(df.head(10))
+
+            k = 0
             for i,row in df.iterrows():
+                k = k + 1
+                if k < 10:
+                    print(row)
+                    print("\n")
                 conn.execute(sql2, row)
-            
+
             conn.close()
 
         print("Finished: updating pathway counts in source table")

From 17ba30fe7258e4192dc70f67c9b0113d4323ee56 Mon Sep 17 00:00:00 2001
From: johnbraisted <jb212828@gmail.com>
Date: Fri, 16 Feb 2024 11:59:44 -0500
Subject: [PATCH 2/2] remove old code that's not used. Remediation of
 vulnerabilities

---
 src/update/RaMPDatabase.py                    |  68 ------------------
 src/update/RaMPFixer.py                       |  27 -------
 .../__pycache__/RaMPDatabase.cpython-35.pyc   | Bin 2293 -> 0 bytes
 .../__pycache__/RaMPFixer.cpython-35.pyc      | Bin 2464 -> 0 bytes
 4 files changed, 95 deletions(-)
 delete mode 100644 src/update/RaMPDatabase.py
 delete mode 100644 src/update/RaMPFixer.py
 delete mode 100644 src/update/__pycache__/RaMPDatabase.cpython-35.pyc
 delete mode 100644 src/update/__pycache__/RaMPFixer.cpython-35.pyc

diff --git a/src/update/RaMPDatabase.py b/src/update/RaMPDatabase.py
deleted file mode 100644
index 2e43159..0000000
--- a/src/update/RaMPDatabase.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import os
-import pymysql.cursors
-from fileinput import close
-import pandas as pd
-
-
-class RaMPDatabase():
-    '''
-    This class is the super class of all checker, updater class 
-    It contains general functions to aid genral functionality of other classes
-    
-    attribute str dbname database name for the mysql
-     
-    '''
-    def __init__(self):
-        
-        self.table_names = [
-                       "analyte",
-                       "analytehasontology",
-                       "analytehaspathway",
-                       "analytesynonym",
-                       "catalyzed",
-                       "ontology",
-                       "pathway",
-                       "source"]
-        
-        
-        
-    def check_path(self,dir):
-        '''
-        This fucntion check if this directory exists, otherwise it will create one
-        - param dict dir: The directory to check or created.
-        - return: True if the path has been created successfully
-        '''
-        if not os.path.exists(dir):
-            try:
-                os.makedirs(dir) # check if the directory exists, create one if not
-                return True
-            except OSError as e: # Trap the OS error and show the embedded error code
-                if e.errno != errno.EEXIST:
-                    raise
-        
-    
-    def connectToRaMP(self, host= "localhost", user = "root"
-                      ,password = "Ehe131224",dbname = "mathelabramp"):
-        '''
-        Connect to local RaMP database by MySQL
-        - param str host host name for the mysql connection
-        - param str user username for the mysql conncection 
-        - param str dbname database name for connection if None: connect to the database page
-        instead of table page
-        - param str password the password you used for you computer's mysql database 
-        '''
-        if dbname is not None:
-            conn = pymysql.connect(host = host,
-                                   user= user,
-                                   password = password,
-                                   db = dbname,
-                                   charset = "utf8mb4",
-                                   cursorclass = pymysql.cursors.DictCursor)
-        else:
-            conn = pymysql.connect(host = host,
-                                   user= user,
-                                   password = password,
-                                   charset = "utf8mb4",
-                                   cursorclass = pymysql.cursors.DictCursor)
-        return conn     
-        
diff --git a/src/update/RaMPFixer.py b/src/update/RaMPFixer.py
deleted file mode 100644
index 2747155..0000000
--- a/src/update/RaMPFixer.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from update.RaMPDatabase import RaMPDatabase
-import pymysql.err
-from builtins import str
-import pandas as pd
-import numpy as np
-from schema import session,Source,Analyte,Pathway,Analytehasontology,\
-Analytehaspathway,Analytesynonym,Ontology,Catalyzed
-
-class RaMPFixer(RaMPDatabase):
-    '''
-    This class simulates the function of the C# code in RaMP
-    Get the original RaMP data from MySQL, then do the following things:
-    1) For each table, find the missing data cell, and remove the entire row since the map
-    is not successful. (Some columns are deliberated to left empty with 'NA'.)
-    2) Remove special character such as ' ', and wrong data in the cell. Consider drop entire
-    row based on conditions
-    3) Remapping corrected RaMP ID relations based on corrected data.
-    4) Create and import new RaMP data to the database.
-    '''
-    
-    def __init__(self):
-        super().__init__()
-        
-        self.new_tables = dict()
-            
-    
-
diff --git a/src/update/__pycache__/RaMPDatabase.cpython-35.pyc b/src/update/__pycache__/RaMPDatabase.cpython-35.pyc
deleted file mode 100644
index 24e39f980f2f0af9209081d4d7d1bd6bb6318754..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2293
zcmZ`*-EQ1O6h5|hy+7SFB~n{aaWO(dXhn(Wk5GvaRHOk_qJfg2w$@74tUa4`;`O*Q
zV+BVk7o_jNGw?v%%q>^E0#}?f_HMS&+VRQEc+St9^PSIoV`F3Z;{D&F?IFNluyzGF
zeu$6$8bykK9^?R#=f*zdJ}d%=0`S^*9f-Q{T^c~zf!Dab0~bC-A?Y2AyMLn**dKeg
zo8(f9gr>_Y#KA*+^tUL)(nsWzDmqAU5Rl2JLsrwS8+Rc^+dWA8kPaXnLV5*WV+^=B
zLWdi6^b>#b<0EePlxuPJ=aAubIL|aoa;`PQftd@Yt5T?TNzNG0bC%3Saw61iR+T9?
ztGCX;W5W_z7@ifH9f?9Hp0in1Bqo!%VkF~PN-x&ed7hb?p>c^0TVFyub53w$R5q;)
z24fUUrv+aKhMryE%*|#}xo8))ewjNZ=49ew#^|E|eSCKD(I26})HQi)GPcUqQ6J6$
zVpq|F@nApd@q*{I5z$sVm~)LMl({^rqxUY?N^a(-ypFE6OS&#(Q7@uFf|2I+nMkAl
zO0U=MwxO1lO2pWUuHboc5z}!r!l0u?KKla{)&8Gvc6TN}YoYYy!JRKYzc+a#lWHLf
zqbH|Qo#>J$VnWrNXq8Nws<`>Gy4z)K`*EBVnTg}8l!9w0APBtm`|SEUBDR<N6QB0t
zMSv$gm%}*@48XAm$3C2UaN&VYp}y`GgLcIuY(Fb@pkq34G6a=?_sR$F>=vB+aNGsc
zh2s!T0#HxkH-Kj^s3tz(o?m@<UU={vX9wfp%r-?`LUdLo1wpF8BFko2eFSWpDUled
zY9@Zkw9&U4NS$U{u*|U2jL@qDf|kibtW18&O0M_<)e}QDcCZf+VJl+@TZeguC~MTy
z?KNd3Or;7`QWd(PPy`w@bB0J^Qz0-)=QG7-N)n;<tjhCxW#Djs+_S#awnIubY(lsF
z1wRoOgSNdN4)&Ci%65fPg|y+`-t)%?hvPtXu?RN6P1X|*DUImS^3a7Glg$rsuBq%0
zZehhYyuP=E_u0)i>V3K24sx0Bd@i-I9VMlS277aH=iZ&Wckf4|1zHq2pJK+!gd(7e
zV7h^i!$ysRb0i`UP6nX%TAW}d5huu0SbA)_b0n^=+qR@Bf)k=5fC~s;b)op|-HLBf
z>7vSc7r`>%ud&-#>D`q@fuJDnai?N*=u7&T*6c}r@bvrD{v{ZQYbXPa_&>`K&yit7
z;ahbp>{mwO9jZyA+wc@|@4Dl6lH8?JLfSr}cxM?1osginV#bmmU1A&w*$AEz%{rn%
zgs#0^_gy0UBdw$@j@wBsD>_Zeh-zt)$VG{qtUlJvvF@ecRXX-<r@;`V!S<Jit)EV9
zFPU?tg|WTL%pNSJ_wALWQd%na^%-|;udJP2+}R`=2X2G5!$;VtyKXWLT@$G52sw3w
z#7z`7Y!YZYr2F5_*k}zNN`k^CxL@Bw0mG5E>4ojR<uAvZNLOz*N)u0`PR)zdN=1%i
zJB;InOe^H^*pA}(WySOMO)rj9nZ&W8uaMdz@gWInoHaqMtB7c6UE@y<81+JLbak{D
zdTxE(i#FTLY`6JP)ZfuymgOQVuqKKK+lD1CQm(CErcQCqiSy}5qW@(R?pO4E)U-bG
Iu6d*2Uj+JPM*si-

diff --git a/src/update/__pycache__/RaMPFixer.cpython-35.pyc b/src/update/__pycache__/RaMPFixer.cpython-35.pyc
deleted file mode 100644
index 7724f6515ca78d24d7be18997854629a3ede894f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2464
zcmZ`*TW=dh6h7-)Y{w}OP}Ek6G>A&Us92ODR7wRzNn0wY3r!Hhidc<z$ML4SJKLGv
zrgoDTQo(QGFYPN&`~{x)&a9n8qHMgkIk)eeIp^DKtgUs|JJZp3ZKA*E($T>4H#qID
zxP<t56cZ&LjXd)By+(13Ms+8vQ{14WNu%butVN?0Cxc#_Mjd(>HYjY;YnV1kd308%
zQJ3B9^;>@<2KBwN_f$UqVOQpIEUg-1p>A^%uyA=WEI#aATof@HfEm>oAHe)UG{Fj9
z>-0O;5z<8sTNJL)YXEMt&R)M=b~x69=%q69;~G&{;Mku=Rs^xMR@f*hVwo!|@~IM&
zA`S9Lr$SHM`_2a<(4i7hD%g!PeyDOM(<VBMQW-mm2$7AL7@dg6v%Md`zsV}82zBLF
z$FV++(nGxAYVSHnxBB9NHbTi@Do~18-4v534V^_2S<Cv)ZJ^@#rjQUBmFO4BX{a=h
zj1op4BZoAo7898{*GMnbxv)hLC~GH0JP;r5=|mx$xJXhfBuqmUM`M&FSBR-atR}fo
zNtVyVag<NR)^K}k(0A@W?Te>XWSglVLe9Zd8X4ruaEz&tR&0r_3gfZSOW-IQSC#P(
z#EwpF6v84jIujLq=bcmFMhFE8ALulUcnhqP-FA^>mT@rxZHz+DuqyeZUDN}6$I6!-
zES0!+1Lx(lzSuDeiMk?2Nv2IMQgwW3>reoO$~KIpvI{)dghYHt^H3e9-NYp)IwLwl
zdqbGl_;y0%yD{18d&7RCY}z7IrfmCult#JlmyIwA@=+IYeYf2fjo+{;p5Wf`eb<k^
z|0n(P_0HXcpR6+W;LfctKEHjitAipz!R*1YHqUG(19iadcwkL%P-MVdy}K?SWHZwO
z!gYq~#>iW%&DYPTV6en9EPIJW&aoUDnPdUy#s?eoxr5VQMdl}%G>@Kj$XulpH-UJz
zH@F6vz%!42^C<UN4bP~>vo@J;-HMY|D|X-M*BAO2*Q5Hv<P;C6r5j-QKO^t7L3xvo
zT66+YgU)Jn+N2yj({ff2wb>7T-bs@uK5SFip|DGb4Z3!OCjjZv(JDnw>%U6(cxiR1
z8}=?Kt`ZB6dc3ELuxoD$yVt)WU!pv&!_HadoI$ElwL%$#A&+vE*oq(2OT1v>c;c%z
ztBh5pYC$$ZT!gA336}8zvbgVNlZOedDs!<Ia-%YX=77ubObm6Z1mf}Vo&#fhjPTEH
zFe@a^0mceEkmm7B*r_h!(2>U2(cM%Tb?yLaG7;eD3N=6RJid@|0nS7?9vzpENM-US
zyC(~>499OaUd~;WSx{hxD_pq-Q?Yv^kKJ^hgBs2^f6>Jfd1mwru*)SB^UZxOu|u@f
zLT+*aVEzt94{Rbns<!3T>6c|I95ep;wIL{pAx)6C5>wEdjpj<cIvusGE=-^vg`*Zv
z4c{N_=Ce$Vt{kdVJuy2P^irf@*$s4(=yV7VWo@`Ue}8+zeCddkt2f+sABoB{_VU$f
zf33W-Z87*vY4^Er$_BeOJep+-g~q^^YoN#!!1p!JbXm*7vJG+zefvBvn;CW{2lq^6
zrpgucMhvznYiU;2hjNGvLB6T%w+#Lfpt9*UZPdNk9(;=7rol1FS~xLXs8yF$Hn?83
zweLvYcfI8ESI^JK*CDqXxKQVY_euRNZykRd-W_k#>v<cs`Hc$-`hO?=7853K0tmC~
z`-u*Vn4f#T|Gbd#LbBrfp$>fCF`W^Jc>%QVnGP>7Uc&^)WM=u&>$Ym^ZMW86E#Iw}
zbHK267qoAf<VUO7ESaGU1I%u@vUn-m<06XlD7B02$nZgy*1QLcvY8eMc=sBfo%;(k
XmulC(S7q}Re@!eG_H9&#&z|=$4&JBJ