Commit

Merge branch 'main' into docs_and_dashes
caufieldjh authored Oct 10, 2022
2 parents 0945aad + 6cf1d1b commit 87f73cc
Showing 4 changed files with 44 additions and 35 deletions.
5 changes: 4 additions & 1 deletion Jenkinsfile
@@ -113,7 +113,10 @@ pipeline {
         stage('Merge') {
             steps {
                 dir('./gitrepo') {
-                    sh '. venv/bin/activate && python3.8 run.py catmerge --merge_all'
+                    sh 'echo "Starting that big merge."'
+                    sh '. venv/bin/activate && python3.8 run.py catmerge --merge_all'
+                    sh 'echo "Finished that big merge."'
+                    //sh '. venv/bin/activate && python3.8 run.py catmerge --exclude NCBITAXON,GAZ,DRON,BERO,SNOMEDCT'
                     sh 'gunzip data/merged/merged-kg.tar.gz'
                     sh 'tar -rvf data/merged/merged-kg.tar data/merged/qc/'
                     sh 'tar -rvf data/merged/merged-kg.tar data/merged/merged-kg_nodes.tsv'
2 changes: 1 addition & 1 deletion docs/_config.yml
@@ -3946,4 +3946,4 @@ ontologies:
 - id: suicideo
   status: OK
   nodecount: 409
-  edgecount: 422
+  edgecount: 422
68 changes: 37 additions & 31 deletions kg_bioportal/merge_utils/merge_kg.py
@@ -1,5 +1,4 @@
 import copy
-import gzip
 import os
 from copy import deepcopy
 import tarfile
@@ -183,43 +182,50 @@ def merge_with_cat_merge(merge_all: bool, include_only: list, exclude: list) ->
         name='merged-kg',
         nodes=nodepaths,
         edges=edgepaths,
-        output_dir=OUTPUT_PATH
+        output_dir=OUTPUT_PATH,
+        qc_report=False
     )
 
-    # Check for nodes with identical CURIEs
-    # by parsing the OUTPUT_PATH/qc/merged-kg-duplicate-nodes.tsv
-    comp_dupnode_path = os.path.join(OUTPUT_PATH,"qc","merged-kg-duplicate-nodes.tsv.gz")
-    with gzip.open(comp_dupnode_path) as infile:
-        dupnode_df = pd.read_csv(infile, sep='\t', index_col='id')
-    uniq_df = dupnode_df.groupby('id').agg(lambda x: '|'.join(set(x))).reset_index()
-    dup_count = len(dupnode_df)
-    uniq_count = len(uniq_df)
-    uniq_ids = list(uniq_df['id'])
-    print(f"Reducing {dup_count} duplicated nodes to {uniq_count} nonredundant nodes...")
+    # Find duplicate nodes and rows
+    # This would normally be done by the cat_merge qc,
+    # but we don't need the full report, just dup nodes
+    # For duplicate rows, remove all but the first instance.
+    # For duplicate nodes (those with identical CURIEs),
+    # merge all fields with a delimiter
 
     nodefile_name = "merged-kg_nodes.tsv"
     nodefile_path = os.path.join(OUTPUT_PATH,nodefile_name)
-    edgefile_name = "merged-kg_nodes.tsv"
-    edgefile_path = os.path.join(OUTPUT_PATH,edgefile_name)
     temp_nodefile_name = "merged-kg_nodes.tsv.temp"
     temp_nodefile_path = os.path.join(OUTPUT_PATH,temp_nodefile_name)
     merge_graph_path = os.path.join(OUTPUT_PATH,'merged-kg.tar.gz')
-    tgfile = tarfile.open(merge_graph_path)
-    tgfile.extract(nodefile_name, OUTPUT_PATH)
 
-    with open(nodefile_path, 'r') as infile:
-        with open(temp_nodefile_path, 'w') as outfile:
-            seen_ids = []
-            for line in infile:
-                splitline = line.split("\t")
-                if splitline[0] in uniq_ids:
-                    if splitline[0] in seen_ids:
-                        continue
-                    outrow = uniq_df.loc[uniq_df['id'] == splitline[0]]
-                    outline = outrow.to_csv(header=None, index=False, sep='\t')
-                    seen_ids.append(splitline[0])
-                    outfile.write(outline)
-                else:
-                    outfile.write(line)
 
-    os.replace(temp_nodefile_path,nodefile_path)
+    graph_file_paths = []
+
+    with tarfile.open(merge_graph_path) as intar:
+        graph_files = intar.getnames()
+        print(graph_files)
+        for graph_file in graph_files:
+            intar.extract(graph_file, path=os.path.dirname(merge_graph_path))
+            graph_file_paths.append(os.path.join(os.path.dirname(merge_graph_path), graph_file))
+    os.remove(merge_graph_path)
+
+    # Remove duplicate rows and merge duplicate nodes
+    print("Reading merged graph to process duplicates...")
+    nodes_df = pd.read_csv(nodefile_path, sep='\t', index_col='id')
+    print(f"Node count before removing complete duplicates: {len(nodes_df.index)}")
+    nodes_df.drop_duplicates(keep='first', inplace=True)
+    print(f"Node count after removing complete duplicates: {len(nodes_df.index)}")
+    uniq_df = nodes_df.groupby('id').agg(lambda x: '|'.join(set(x)))
+    print(f"Node count after merging duplicate nodes: {len(uniq_df.index)}")
+    uniq_df.to_csv(temp_nodefile_path, sep='\t')
+
+    os.replace(temp_nodefile_path, nodefile_path)
+
+    # Compress it again
+    with tarfile.open(merge_graph_path, "w:gz") as outtar:
+        for graph_file in graph_file_paths:
+            outtar.add(graph_file, arcname=os.path.basename(graph_file))
+            os.remove(graph_file)
+
+    print("Complete.")
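A quick aside for readers tracing the new deduplication logic above: the sketch below shows the commit's two-step strategy (drop fully duplicated rows, then pipe-join the remaining fields of nodes that share a CURIE) on a toy table. The table and its values are invented for illustration and are not part of this commit.

import pandas as pd

# Toy node table (invented data): two fully identical rows for X:1,
# a third X:1 row that differs in "name", and an unrelated X:2 row.
nodes_df = pd.DataFrame(
    {
        "id": ["X:1", "X:1", "X:1", "X:2"],
        "name": ["alpha", "alpha", "beta", "gamma"],
        "category": ["biolink:NamedThing"] * 4,
    }
)

# Step 1: drop completely duplicated rows, keeping the first instance.
nodes_df = nodes_df.drop_duplicates(keep="first")

# Step 2: merge rows that still share a CURIE, joining each field's
# distinct values with a pipe delimiter.
uniq_df = nodes_df.groupby("id").agg(lambda x: "|".join(set(x)))

print(uniq_df)
# X:1 keeps name "alpha|beta" (set order is arbitrary); X:2 is unchanged.

One simplification: the commit's version reads merged-kg_nodes.tsv with index_col='id' and writes the merged frame back before re-tarring, while this sketch keeps id as an ordinary column to stay short.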
4 changes: 2 additions & 2 deletions setup.py
@@ -62,7 +62,7 @@ def find_version(*file_paths):
         'compress_json',
         'click==8.0.4',
         'pyyaml',
-        'kgx==1.5.9',
+        'kgx',
         'sphinx',
         'sphinx_rtd_theme',
         'recommonmark',
@@ -71,7 +71,7 @@ def find_version(*file_paths):
         'pandas',
         'networkx',
         'kghub-downloader',
-        'cat-merge==0.1.15'
+        'cat-merge'
     ],
     extras_require=extras,
)
