Commit

Merge branch 'main' into docs_and_dashes
caufieldjh authored Oct 10, 2022
2 parents 0945aad + 6cf1d1b commit 87f73cc
Showing 4 changed files with 44 additions and 35 deletions.
5 changes: 4 additions & 1 deletion Jenkinsfile
@@ -113,7 +113,10 @@ pipeline {
         stage('Merge') {
             steps {
                 dir('./gitrepo') {
-                    sh '. venv/bin/activate && python3.8 run.py catmerge --merge_all'
+                    sh 'echo "Starting that big merge."'
+                    sh '. venv/bin/activate && python3.8 run.py catmerge --merge_all'
+                    sh 'echo "Finished that big merge."'
+                    //sh '. venv/bin/activate && python3.8 run.py catmerge --exclude NCBITAXON,GAZ,DRON,BERO,SNOMEDCT'
                     sh 'gunzip data/merged/merged-kg.tar.gz'
                     sh 'tar -rvf data/merged/merged-kg.tar data/merged/qc/'
                     sh 'tar -rvf data/merged/merged-kg.tar data/merged/merged-kg_nodes.tsv'
2 changes: 1 addition & 1 deletion docs/_config.yml
@@ -3946,4 +3946,4 @@ ontologies:
 - id: suicideo
   status: OK
   nodecount: 409
-  edgecount: 422
+  edgecount: 422
68 changes: 37 additions & 31 deletions kg_bioportal/merge_utils/merge_kg.py
@@ -1,5 +1,4 @@
 import copy
-import gzip
 import os
 from copy import deepcopy
 import tarfile
@@ -183,43 +182,50 @@ def merge_with_cat_merge(merge_all: bool, include_only: list, exclude: list) ->
         name='merged-kg',
         nodes=nodepaths,
         edges=edgepaths,
-        output_dir=OUTPUT_PATH
+        output_dir=OUTPUT_PATH,
+        qc_report=False
     )
 
-    # Check for nodes with identical CURIEs
-    # by parsing the OUTPUT_PATH/qc/merged-kg-duplicate-nodes.tsv
-    comp_dupnode_path = os.path.join(OUTPUT_PATH,"qc","merged-kg-duplicate-nodes.tsv.gz")
-    with gzip.open(comp_dupnode_path) as infile:
-        dupnode_df = pd.read_csv(infile, sep='\t', index_col='id')
-    uniq_df = dupnode_df.groupby('id').agg(lambda x: '|'.join(set(x))).reset_index()
-    dup_count = len(dupnode_df)
-    uniq_count = len(uniq_df)
-    uniq_ids = list(uniq_df['id'])
-    print(f"Reducing {dup_count} duplicated nodes to {uniq_count} nonredundant nodes...")
+    # Find duplicate nodes and rows
+    # This would normally be done by the cat_merge qc,
+    # but we don't need the full report, just dup nodes
+    # For duplicate rows, remove all but the first instance.
+    # For duplicate nodes (those with identical CURIEs),
+    # merge all fields with a delimiter
 
     nodefile_name = "merged-kg_nodes.tsv"
     nodefile_path = os.path.join(OUTPUT_PATH,nodefile_name)
-    edgefile_name = "merged-kg_nodes.tsv"
-    edgefile_path = os.path.join(OUTPUT_PATH,edgefile_name)
     temp_nodefile_name = "merged-kg_nodes.tsv.temp"
     temp_nodefile_path = os.path.join(OUTPUT_PATH,temp_nodefile_name)
     merge_graph_path = os.path.join(OUTPUT_PATH,'merged-kg.tar.gz')
-    tgfile = tarfile.open(merge_graph_path)
-    tgfile.extract(nodefile_name, OUTPUT_PATH)
 
-    with open(nodefile_path, 'r') as infile:
-        with open(temp_nodefile_path, 'w') as outfile:
-            seen_ids = []
-            for line in infile:
-                splitline = line.split("\t")
-                if splitline[0] in uniq_ids:
-                    if splitline[0] in seen_ids:
-                        continue
-                    outrow = uniq_df.loc[uniq_df['id'] == splitline[0]]
-                    outline = outrow.to_csv(header=None, index=False, sep='\t')
-                    seen_ids.append(splitline[0])
-                    outfile.write(outline)
-                else:
-                    outfile.write(line)
 
-    os.replace(temp_nodefile_path,nodefile_path)
+    graph_file_paths = []
+
+    with tarfile.open(merge_graph_path) as intar:
+        graph_files = intar.getnames()
+        print(graph_files)
+        for graph_file in graph_files:
+            intar.extract(graph_file, path=os.path.dirname(merge_graph_path))
+            graph_file_paths.append(os.path.join(os.path.dirname(merge_graph_path), graph_file))
+    os.remove(merge_graph_path)
+
+    # Remove duplicate rows and merge duplicate nodes
+    print("Reading merged graph to process duplicates...")
+    nodes_df = pd.read_csv(nodefile_path, sep='\t', index_col='id')
+    print(f"Node count before removing complete duplicates: {len(nodes_df.index)}")
+    nodes_df.drop_duplicates(keep='first', inplace=True)
+    print(f"Node count after removing complete duplicates: {len(nodes_df.index)}")
+    uniq_df = nodes_df.groupby('id').agg(lambda x: '|'.join(set(x)))
+    print(f"Node count after merging duplicate nodes: {len(uniq_df.index)}")
+    uniq_df.to_csv(temp_nodefile_path, sep='\t')
+
+    os.replace(temp_nodefile_path, nodefile_path)
+
+    # Compress it again
+    with tarfile.open(merge_graph_path, "w:gz") as outtar:
+        for graph_file in graph_file_paths:
+            outtar.add(graph_file, arcname=os.path.basename(graph_file))
+            os.remove(graph_file)
+
+    print("Complete.")
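A quick aside for readers tracing the new deduplication logic above: the sketch below shows the commit's two-step strategy (drop fully duplicated rows, then pipe-join the remaining fields of nodes that share a CURIE) on a toy table. The table and its values are invented for illustration and are not part of this commit.

import pandas as pd

# Toy node table (invented data): two fully identical rows for X:1,
# a third X:1 row that differs in "name", and an unrelated X:2 row.
nodes_df = pd.DataFrame(
    {
        "id": ["X:1", "X:1", "X:1", "X:2"],
        "name": ["alpha", "alpha", "beta", "gamma"],
        "category": ["biolink:NamedThing"] * 4,
    }
)

# Step 1: drop completely duplicated rows, keeping the first instance.
nodes_df = nodes_df.drop_duplicates(keep="first")

# Step 2: merge rows that still share a CURIE, joining each field's
# distinct values with a pipe delimiter.
uniq_df = nodes_df.groupby("id").agg(lambda x: "|".join(set(x)))

print(uniq_df)
# X:1 keeps name "alpha|beta" (set order is arbitrary); X:2 is unchanged.

One simplification: the commit's version reads merged-kg_nodes.tsv with index_col='id' and writes the merged frame back before re-tarring, while this sketch keeps id as an ordinary column to stay short.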
4 changes: 2 additions & 2 deletions setup.py
@@ -62,7 +62,7 @@ def find_version(*file_paths):
         'compress_json',
         'click==8.0.4',
         'pyyaml',
-        'kgx==1.5.9',
+        'kgx',
         'sphinx',
         'sphinx_rtd_theme',
         'recommonmark',
@@ -71,7 +71,7 @@ def find_version(*file_paths):
         'pandas',
         'networkx',
         'kghub-downloader',
-        'cat-merge==0.1.15'
+        'cat-merge'
     ],
     extras_require=extras,
)
