diff --git a/README.md b/README.md index 16fd4a9..17d066c 100755 --- a/README.md +++ b/README.md @@ -320,7 +320,11 @@ Here are some tips to speed up the evaluation: You can inspect the failed samples by using the following command: ```bash -bigcodebench.inspect --eval-results sample-sanitized-calibrated_eval_results.json --in-place +# Inspect the failed samples and save the results to `inspect/` +bigcodebench.inspect --eval_results sample-sanitized-calibrated_eval_results.json --split complete --subset hard + +# Re-run the inspection in place +bigcodebench.inspect --eval_results sample-sanitized-calibrated_eval_results.json --split complete --subset hard --in_place ``` ## 🚀 Full Script diff --git a/bigcodebench/eval/utils.py b/bigcodebench/eval/utils.py index 844d2ea..82b8085 100644 --- a/bigcodebench/eval/utils.py +++ b/bigcodebench/eval/utils.py @@ -29,6 +29,7 @@ import tempfile import subprocess import multiprocessing +import time from typing import Optional TIMEOUT_LIMIT=240.0 @@ -141,7 +142,7 @@ def safe_kill(pid, sig): else: print(f"Prevented attempt to kill PID {pid} with signal {sig}") except ProcessLookupError: - print(f"Process {pid} does not exist.") + pass def safe_killpg(pgid, sig): if pgid == current_pgid or pgid in {os.getpgid(pid) for pid in child_pids}: @@ -221,7 +222,22 @@ def safe_exec(*args, **kwargs): try: yield finally: - # Restore original functions after the block + for pid in child_pids: + try: + os.kill(pid, signal.SIGTERM) + for _ in range(10): + time.sleep(0.1) + try: + os.kill(pid, 0) + except ProcessLookupError: + break + else: + os.kill(pid, signal.SIGKILL) + except ProcessLookupError: + pass + except Exception as e: + print(f"Error handling process {pid}: {e}") + os.kill = original_kill os.killpg = original_killpg os.system = original_system diff --git a/bigcodebench/inspect.py b/bigcodebench/inspect.py index b06f5bd..50c7e01 100755 --- a/bigcodebench/inspect.py +++ b/bigcodebench/inspect.py @@ -14,15 +14,17 @@ def inspection(args): -- completion.py: prompt + completion -- execution_trace.txt: execution trace """ - path = os.path.join("inspect", args.eval_results.split("/")[-1].replace(".json", "")) + path = os.path.join(args.save_path, args.eval_results.split("/")[-1].replace(".json", "")) if args.in_place: shutil.rmtree(path, ignore_errors=True) if not os.path.exists(path): os.makedirs(path) - problems = get_bigcodebench() + problems = get_bigcodebench(subset=args.subset) eval_results = json.load(open(args.eval_results, "r")) for task_id, results in eval_results["eval"].items(): + if task_id not in problems: + continue if all(result["status"] == "pass" for result in results): continue task_path = os.path.join(path, task_id) @@ -30,7 +32,7 @@ def inspection(args): os.makedirs(task_path) task_id_data = problems[task_id] with open(os.path.join(task_path, "ground_truth.py"), "w") as f: - f.write(task_id_data[f"{args.subset}_prompt"] + "\n\n" + task_id_data["canonical_solution"]) + f.write(task_id_data[f"{args.split}_prompt"] + "\n\n" + task_id_data["canonical_solution"]) # write test with open(os.path.join(task_path, "test_case.py"), "w") as f: @@ -48,9 +50,13 @@ def inspection(args): f.write("="*50 + "\n") def main(): parser = argparse.ArgumentParser() - parser.add_argument("--eval-results", required=True, type=str) - parser.add_argument("--subset", required=True, type=str) - parser.add_argument("--in-place", action="store_true") + parser.add_argument("--eval_results", required=True, type=str) + parser.add_argument( + "--split", required=True, type=str, choices=["complete", "instruct"] + ) + parser.add_argument("--subset", default="hard", type=str, choices=["full", "hard"]) + parser.add_argument("--save_path", default="inspect", type=str) + parser.add_argument("--in_place", action="store_true") args = parser.parse_args() inspection(args)