From 37546f3084e0979419bf79faf367006cd4123fad Mon Sep 17 00:00:00 2001
From: Aaron Leong
Date: Wed, 14 Sep 2016 13:55:00 -0700
Subject: [PATCH 1/2] Keep step a non-zero integer in diff_collections2 so it
 won't throw an error in Python3

---
Review notes (text between the three-dash line and the diff is not part of
the commit message):
* int(a / b) truncates to 0 when there are fewer common ids than CPUs, and
  range() raises ValueError on a zero step; the step is now floor-divided
  and clamped to at least 1.
* NOTE(review): the leading whitespace of this hunk was rebuilt from the
  Python block structure, and the index ids predate this edit; verify both
  against src/utils/diff.py before applying.

 src/utils/diff.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/utils/diff.py b/src/utils/diff.py
index 00b3b221..e219bbbb 100644
--- a/src/utils/diff.py
+++ b/src/utils/diff.py
@@ -209,7 +209,8 @@ def diff_collections2(b1, b2, result_dir, use_parallel=True, step=10000):
     _updates = []
     if len(ids_common) > 0:
         if use_parallel:
-            step = len(ids_common)/multiprocessing.cpu_count()
+            # // keeps step an int on Python 3; max() keeps it non-zero for range()
+            step = max(1, len(ids_common) // multiprocessing.cpu_count())
             task_list = [ids_common[i:i+step] for i in range(0, len(ids_common), step)]
             pool = multiprocessing.Pool()
             partial_worker = partial(_diff_parallel_worker, b1.target_collection.name, b2.target_collection.name)

From 66ba6a1440d6c2abd5ce62d67e9bd833ace7e053 Mon Sep 17 00:00:00 2001
From: Aaron Leong
Date: Wed, 14 Sep 2016 15:21:25 -0700
Subject: [PATCH 2/2] Added script to compare pyobj output from diff_collections2

---
Review notes (text between the three-dash line and the diff is not part of
the commit message):
* The file is created with mode 100755, so it now starts with a shebang.
* The batch starts now run up to and including the last file number; the
  previous range(1, _doc_num, step) skipped the last file whenever
  (_doc_num - 1) was a multiple of step.
* step is clamped to at least 1, so fewer files than workers no longer
  makes range() raise ValueError.
* The hard-coded worker count became the num_workers=4 keyword argument.
* The pool is closed and joined in a finally block.
* The file now ends with a newline.
* NOTE(review): the index id below predates these edits.

 src/utils/compare_pyobj.py | 74 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100755 src/utils/compare_pyobj.py

diff --git a/src/utils/compare_pyobj.py b/src/utils/compare_pyobj.py
new file mode 100755
index 00000000..585803b7
--- /dev/null
+++ b/src/utils/compare_pyobj.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+"""Compare the .pyobj files written by diff_collections2 in two directories.
+
+Both directories are expected to hold files named 1.pyobj, 2.pyobj, ...
+Usage: compare_pyobj directory1 directory2
+"""
+import multiprocessing
+import os
+import sys
+from functools import partial
+
+from utils.common import loadobj
+
+# Keys of a diff object that must match between the two directories.
+COMPARED_KEYS = ('source', 'add', 'delete', 'update')
+
+
+def pyobj_compare_worker(file_range, dir1, dir2):
+    """Compare one batch of numbered .pyobj files.
+
+    Returns a tuple (file_range, fail_list), where fail_list holds the file
+    numbers whose compared keys differ between dir1 and dir2.
+    """
+    print("Starting worker on: " + str(file_range))
+    fail_list = []
+    for _file_num in file_range:
+        _name = str(_file_num) + '.pyobj'
+        _obj1 = loadobj(os.path.join(dir1, _name))
+        _obj2 = loadobj(os.path.join(dir2, _name))
+        if any(_obj1[key] != _obj2[key] for key in COMPARED_KEYS):
+            fail_list.append(_file_num)
+    print("Finished worker on: " + str(file_range))
+    return (file_range, fail_list)
+
+
+def pyobj_compare_parallel(dir1, dir2, num_workers=4):
+    """Compare every numbered .pyobj file in dir1 against its twin in dir2.
+
+    Prints the mismatch count of each batch. Returns False when the two
+    directories do not hold the same number of entries, None otherwise.
+    """
+    print("Starting compare")
+    _doc_num = len(os.listdir(dir1))
+    if len(os.listdir(dir2)) != _doc_num:
+        print("File count does not match")
+        return False
+    # Never let step reach 0 (fewer files than workers): range() rejects it.
+    step = max(1, _doc_num // num_workers)
+    # File numbers start at 1 and the last one is _doc_num itself.
+    task_list = [range(start, min(start + step, _doc_num + 1))
+                 for start in range(1, _doc_num + 1, step)]
+    partial_function = partial(pyobj_compare_worker, dir1=dir1, dir2=dir2)
+    pool = multiprocessing.Pool(num_workers)
+    try:
+        results = pool.map(partial_function, task_list)
+    finally:
+        # Release the worker processes even when a worker raised.
+        pool.close()
+        pool.join()
+    for file_range, fail_list in results:
+        print("Files " + str(file_range[0]) + "-" + str(file_range[-1]) + ":"
+              + str(len(fail_list)) + " mismatches")
+
+
+def main():
+    """Command-line entry point: compare_pyobj directory1 directory2."""
+    if len(sys.argv) != 3:
+        print("Usage: compare_pyobj directory1 directory2")
+        return
+    pyobj_compare_parallel(sys.argv[1], sys.argv[2])
+
+
+if __name__ == "__main__":
+    main()