From aa31b60af5af71e65f1e1eeecb6e9f9f7e5c285e Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 2 Aug 2024 21:25:02 +0800 Subject: [PATCH 01/11] fix: kill all created pids --- bigcodebench/eval/utils.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/bigcodebench/eval/utils.py b/bigcodebench/eval/utils.py index 844d2ea..c65b849 100644 --- a/bigcodebench/eval/utils.py +++ b/bigcodebench/eval/utils.py @@ -221,7 +221,19 @@ def safe_exec(*args, **kwargs): try: yield finally: - # Restore original functions after the block + for pid in child_pids: + try: + os.kill(pid, signal.SIGTERM) + os.waitpid(pid, 0) + except ProcessLookupError: + pass # Process already terminated + except Exception as e: + print(f"Error terminating process {pid}: {e}") + try: + os.kill(pid, signal.SIGKILL) + except Exception: + pass + os.kill = original_kill os.killpg = original_killpg os.system = original_system From 18e9401daf54d890e3e2a6c70129c7f42c9b45b7 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 2 Aug 2024 21:38:58 +0800 Subject: [PATCH 02/11] fix: avoid no found pids --- bigcodebench/eval/utils.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/bigcodebench/eval/utils.py b/bigcodebench/eval/utils.py index c65b849..25db40a 100644 --- a/bigcodebench/eval/utils.py +++ b/bigcodebench/eval/utils.py @@ -141,7 +141,7 @@ def safe_kill(pid, sig): else: print(f"Prevented attempt to kill PID {pid} with signal {sig}") except ProcessLookupError: - print(f"Process {pid} does not exist.") + pass def safe_killpg(pgid, sig): if pgid == current_pgid or pgid in {os.getpgid(pid) for pid in child_pids}: @@ -224,15 +224,18 @@ def safe_exec(*args, **kwargs): for pid in child_pids: try: os.kill(pid, signal.SIGTERM) - os.waitpid(pid, 0) + # Wait for a short time to see if the process terminates + for _ in range(10): # Wait up to 1 second + time.sleep(0.1) + if os.waitpid(pid, os.WNOHANG) != (0, 0): + break + else: + # If the process didn't terminate, try SIGKILL + os.kill(pid, signal.SIGKILL) except ProcessLookupError: pass # Process already terminated except Exception as e: - print(f"Error terminating process {pid}: {e}") - try: - os.kill(pid, signal.SIGKILL) - except Exception: - pass + print(f"Error handling process {pid}: {e}") os.kill = original_kill os.killpg = original_killpg From 50d1fd1839f54318287d2d9cdfc31d59afade434 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 2 Aug 2024 21:43:07 +0800 Subject: [PATCH 03/11] fix: add time --- bigcodebench/eval/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigcodebench/eval/utils.py b/bigcodebench/eval/utils.py index 25db40a..f1301ea 100644 --- a/bigcodebench/eval/utils.py +++ b/bigcodebench/eval/utils.py @@ -29,6 +29,7 @@ import tempfile import subprocess import multiprocessing +import time from typing import Optional TIMEOUT_LIMIT=240.0 From 092020000721852560e399c4fd98134c32e33b89 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 2 Aug 2024 21:46:16 +0800 Subject: [PATCH 04/11] fix: avoid no child pid kill --- bigcodebench/eval/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bigcodebench/eval/utils.py b/bigcodebench/eval/utils.py index f1301ea..214f141 100644 --- a/bigcodebench/eval/utils.py +++ b/bigcodebench/eval/utils.py @@ -228,8 +228,11 @@ def safe_exec(*args, **kwargs): # Wait for a short time to see if the process terminates for _ in range(10): # Wait up to 1 second time.sleep(0.1) - if os.waitpid(pid, os.WNOHANG) != (0, 0): - break + try: + # Check if the process has terminated + os.kill(pid, 0) + except ProcessLookupError: + break # Process has terminated else: # If the process didn't terminate, try SIGKILL os.kill(pid, signal.SIGKILL) From f771678591c2ee4b1f824a165179efb0140339c9 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 2 Aug 2024 21:53:56 +0800 Subject: [PATCH 05/11] fix: avoid no child pid kill --- bigcodebench/eval/utils.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/bigcodebench/eval/utils.py b/bigcodebench/eval/utils.py index 214f141..82b8085 100644 --- a/bigcodebench/eval/utils.py +++ b/bigcodebench/eval/utils.py @@ -225,19 +225,16 @@ def safe_exec(*args, **kwargs): for pid in child_pids: try: os.kill(pid, signal.SIGTERM) - # Wait for a short time to see if the process terminates - for _ in range(10): # Wait up to 1 second + for _ in range(10): time.sleep(0.1) try: - # Check if the process has terminated os.kill(pid, 0) except ProcessLookupError: - break # Process has terminated + break else: - # If the process didn't terminate, try SIGKILL os.kill(pid, signal.SIGKILL) except ProcessLookupError: - pass # Process already terminated + pass except Exception as e: print(f"Error handling process {pid}: {e}") From 021aee2ac55682454428967d4c925dae42e81a57 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 2 Aug 2024 22:12:28 +0800 Subject: [PATCH 06/11] fix(inspect): update inspect --- bigcodebench/inspect.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/bigcodebench/inspect.py b/bigcodebench/inspect.py index b06f5bd..920902e 100755 --- a/bigcodebench/inspect.py +++ b/bigcodebench/inspect.py @@ -19,7 +19,7 @@ def inspection(args): shutil.rmtree(path, ignore_errors=True) if not os.path.exists(path): os.makedirs(path) - problems = get_bigcodebench() + problems = get_bigcodebench(subset=flags.subset) eval_results = json.load(open(args.eval_results, "r")) for task_id, results in eval_results["eval"].items(): @@ -30,7 +30,7 @@ def inspection(args): os.makedirs(task_path) task_id_data = problems[task_id] with open(os.path.join(task_path, "ground_truth.py"), "w") as f: - f.write(task_id_data[f"{args.subset}_prompt"] + "\n\n" + task_id_data["canonical_solution"]) + f.write(task_id_data[f"{args.split}_prompt"] + "\n\n" + task_id_data["canonical_solution"]) # write test with open(os.path.join(task_path, "test_case.py"), "w") as f: @@ -49,7 +49,10 @@ def inspection(args): def main(): parser = argparse.ArgumentParser() parser.add_argument("--eval-results", required=True, type=str) - parser.add_argument("--subset", required=True, type=str) + parser.add_argument( + "--split", required=True, type=str, choices=["complete", "instruct"] + ) + parser.add_argument("--subset", default="hard", type=str, choices=["full", "hard"]) parser.add_argument("--in-place", action="store_true") args = parser.parse_args() From 957ea7f2da13dbd10ccad66893c4a04c896e8d78 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 2 Aug 2024 22:13:48 +0800 Subject: [PATCH 07/11] fix(inspect): change flg --- bigcodebench/inspect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/inspect.py b/bigcodebench/inspect.py index 920902e..ce6fb3b 100755 --- a/bigcodebench/inspect.py +++ b/bigcodebench/inspect.py @@ -19,7 +19,7 @@ def inspection(args): shutil.rmtree(path, ignore_errors=True) if not os.path.exists(path): os.makedirs(path) - problems = get_bigcodebench(subset=flags.subset) + problems = get_bigcodebench(subset=args.subset) eval_results = json.load(open(args.eval_results, "r")) for task_id, results in eval_results["eval"].items(): From d35fd70ec1d99534364ea411c47b934435ec9558 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 2 Aug 2024 22:15:58 +0800 Subject: [PATCH 08/11] fix(inspect): update args --- bigcodebench/inspect.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bigcodebench/inspect.py b/bigcodebench/inspect.py index ce6fb3b..da04fad 100755 --- a/bigcodebench/inspect.py +++ b/bigcodebench/inspect.py @@ -14,7 +14,7 @@ def inspection(args): -- completion.py: prompt + completion -- execution_trace.txt: execution trace """ - path = os.path.join("inspect", args.eval_results.split("/")[-1].replace(".json", "")) + path = os.path.join(args.save_path, args.eval_results.split("/")[-1].replace(".json", "")) if args.in_place: shutil.rmtree(path, ignore_errors=True) if not os.path.exists(path): @@ -48,12 +48,13 @@ def inspection(args): f.write("="*50 + "\n") def main(): parser = argparse.ArgumentParser() - parser.add_argument("--eval-results", required=True, type=str) + parser.add_argument("--eval_results", required=True, type=str) parser.add_argument( "--split", required=True, type=str, choices=["complete", "instruct"] ) parser.add_argument("--subset", default="hard", type=str, choices=["full", "hard"]) - parser.add_argument("--in-place", action="store_true") + parser.add_argument("--save_path", default="inspect", type=str) + parser.add_argument("--in_place", action="store_true") args = parser.parse_args() inspection(args) From a9cc5b147f65ee19685a036f1cad1dbdfd6a0620 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 2 Aug 2024 22:17:48 +0800 Subject: [PATCH 09/11] fix(doc): update inspect doc --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 16fd4a9..17d066c 100755 --- a/README.md +++ b/README.md @@ -320,7 +320,11 @@ Here are some tips to speed up the evaluation: You can inspect the failed samples by using the following command: ```bash -bigcodebench.inspect --eval-results sample-sanitized-calibrated_eval_results.json --in-place +# Inspect the failed samples and save the results to `inspect/` +bigcodebench.inspect --eval_results sample-sanitized-calibrated_eval_results.json --split complete --subset hard + +# Re-run the inspection in place +bigcodebench.inspect --eval_results sample-sanitized-calibrated_eval_results.json --split complete --subset hard --in_place ``` ## 🚀 Full Script From 4ab7c7f83cc3aa4ac28752c35434ce1439c91fa0 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 2 Aug 2024 22:25:46 +0800 Subject: [PATCH 10/11] fix(inspect): skip problems --- bigcodebench/inspect.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bigcodebench/inspect.py b/bigcodebench/inspect.py index da04fad..272166c 100755 --- a/bigcodebench/inspect.py +++ b/bigcodebench/inspect.py @@ -28,6 +28,8 @@ def inspection(args): task_path = os.path.join(path, task_id) if not os.path.exists(task_path): os.makedirs(task_path) + if task_id not in problems: + continue task_id_data = problems[task_id] with open(os.path.join(task_path, "ground_truth.py"), "w") as f: f.write(task_id_data[f"{args.split}_prompt"] + "\n\n" + task_id_data["canonical_solution"]) From 617b5bdfc96de98bc77bf82cd521310522ad6e12 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Fri, 2 Aug 2024 22:28:30 +0800 Subject: [PATCH 11/11] fix(inspect): avoid empty folder --- bigcodebench/inspect.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigcodebench/inspect.py b/bigcodebench/inspect.py index 272166c..50c7e01 100755 --- a/bigcodebench/inspect.py +++ b/bigcodebench/inspect.py @@ -23,13 +23,13 @@ def inspection(args): eval_results = json.load(open(args.eval_results, "r")) for task_id, results in eval_results["eval"].items(): + if task_id not in problems: + continue if all(result["status"] == "pass" for result in results): continue task_path = os.path.join(path, task_id) if not os.path.exists(task_path): os.makedirs(task_path) - if task_id not in problems: - continue task_id_data = problems[task_id] with open(os.path.join(task_path, "ground_truth.py"), "w") as f: f.write(task_id_data[f"{args.split}_prompt"] + "\n\n" + task_id_data["canonical_solution"])