From aa31b60af5af71e65f1e1eeecb6e9f9f7e5c285e Mon Sep 17 00:00:00 2001
From: Terry Zhuo <terryzhuo25@gmail.com>
Date: Fri, 2 Aug 2024 21:25:02 +0800
Subject: [PATCH 01/11] fix: kill all created pids

---
 bigcodebench/eval/utils.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/bigcodebench/eval/utils.py b/bigcodebench/eval/utils.py
index 844d2ea..c65b849 100644
--- a/bigcodebench/eval/utils.py
+++ b/bigcodebench/eval/utils.py
@@ -221,7 +221,19 @@ def safe_exec(*args, **kwargs):
     try:
         yield
     finally:
-        # Restore original functions after the block
+        for pid in child_pids:
+            try:
+                os.kill(pid, signal.SIGTERM)
+                os.waitpid(pid, 0)
+            except ProcessLookupError:
+                pass  # Process already terminated
+            except Exception as e:
+                print(f"Error terminating process {pid}: {e}")
+                try:
+                    os.kill(pid, signal.SIGKILL)
+                except Exception:
+                    pass
+        
         os.kill = original_kill
         os.killpg = original_killpg
         os.system = original_system

From 18e9401daf54d890e3e2a6c70129c7f42c9b45b7 Mon Sep 17 00:00:00 2001
From: Terry Zhuo <terryzhuo25@gmail.com>
Date: Fri, 2 Aug 2024 21:38:58 +0800
Subject: [PATCH 02/11] fix: ignore PIDs that are not found

---
 bigcodebench/eval/utils.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/bigcodebench/eval/utils.py b/bigcodebench/eval/utils.py
index c65b849..25db40a 100644
--- a/bigcodebench/eval/utils.py
+++ b/bigcodebench/eval/utils.py
@@ -141,7 +141,7 @@ def safe_kill(pid, sig):
             else:
                 print(f"Prevented attempt to kill PID {pid} with signal {sig}")
         except ProcessLookupError:
-            print(f"Process {pid} does not exist.")
+            pass
 
     def safe_killpg(pgid, sig):
         if pgid == current_pgid or pgid in {os.getpgid(pid) for pid in child_pids}:
@@ -224,15 +224,18 @@ def safe_exec(*args, **kwargs):
         for pid in child_pids:
             try:
                 os.kill(pid, signal.SIGTERM)
-                os.waitpid(pid, 0)
+                # Wait for a short time to see if the process terminates
+                for _ in range(10):  # Wait up to 1 second
+                    time.sleep(0.1)
+                    if os.waitpid(pid, os.WNOHANG) != (0, 0):
+                        break
+                else:
+                    # If the process didn't terminate, try SIGKILL
+                    os.kill(pid, signal.SIGKILL)
             except ProcessLookupError:
                 pass  # Process already terminated
             except Exception as e:
-                print(f"Error terminating process {pid}: {e}")
-                try:
-                    os.kill(pid, signal.SIGKILL)
-                except Exception:
-                    pass
+                print(f"Error handling process {pid}: {e}")
         
         os.kill = original_kill
         os.killpg = original_killpg

From 50d1fd1839f54318287d2d9cdfc31d59afade434 Mon Sep 17 00:00:00 2001
From: Terry Zhuo <terryzhuo25@gmail.com>
Date: Fri, 2 Aug 2024 21:43:07 +0800
Subject: [PATCH 03/11] fix: add missing time import

---
 bigcodebench/eval/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bigcodebench/eval/utils.py b/bigcodebench/eval/utils.py
index 25db40a..f1301ea 100644
--- a/bigcodebench/eval/utils.py
+++ b/bigcodebench/eval/utils.py
@@ -29,6 +29,7 @@
 import tempfile
 import subprocess
 import multiprocessing
+import time
 from typing import Optional
 
 TIMEOUT_LIMIT=240.0

From 092020000721852560e399c4fd98134c32e33b89 Mon Sep 17 00:00:00 2001
From: Terry Zhuo <terryzhuo25@gmail.com>
Date: Fri, 2 Aug 2024 21:46:16 +0800
Subject: [PATCH 04/11] fix: poll with os.kill(pid, 0) instead of os.waitpid

---
 bigcodebench/eval/utils.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/bigcodebench/eval/utils.py b/bigcodebench/eval/utils.py
index f1301ea..214f141 100644
--- a/bigcodebench/eval/utils.py
+++ b/bigcodebench/eval/utils.py
@@ -228,8 +228,11 @@ def safe_exec(*args, **kwargs):
                 # Wait for a short time to see if the process terminates
                 for _ in range(10):  # Wait up to 1 second
                     time.sleep(0.1)
-                    if os.waitpid(pid, os.WNOHANG) != (0, 0):
-                        break
+                    try:
+                        # Check if the process has terminated
+                        os.kill(pid, 0)
+                    except ProcessLookupError:
+                        break  # Process has terminated
                 else:
                     # If the process didn't terminate, try SIGKILL
                     os.kill(pid, signal.SIGKILL)

From f771678591c2ee4b1f824a165179efb0140339c9 Mon Sep 17 00:00:00 2001
From: Terry Zhuo <terryzhuo25@gmail.com>
Date: Fri, 2 Aug 2024 21:53:56 +0800
Subject: [PATCH 05/11] fix: remove comments from child_pids cleanup loop

---
 bigcodebench/eval/utils.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/bigcodebench/eval/utils.py b/bigcodebench/eval/utils.py
index 214f141..82b8085 100644
--- a/bigcodebench/eval/utils.py
+++ b/bigcodebench/eval/utils.py
@@ -225,19 +225,16 @@ def safe_exec(*args, **kwargs):
         for pid in child_pids:
             try:
                 os.kill(pid, signal.SIGTERM)
-                # Wait for a short time to see if the process terminates
-                for _ in range(10):  # Wait up to 1 second
+                for _ in range(10):
                     time.sleep(0.1)
                     try:
-                        # Check if the process has terminated
                         os.kill(pid, 0)
                     except ProcessLookupError:
-                        break  # Process has terminated
+                        break
                 else:
-                    # If the process didn't terminate, try SIGKILL
                     os.kill(pid, signal.SIGKILL)
             except ProcessLookupError:
-                pass  # Process already terminated
+                pass
             except Exception as e:
                 print(f"Error handling process {pid}: {e}")
         

From 021aee2ac55682454428967d4c925dae42e81a57 Mon Sep 17 00:00:00 2001
From: Terry Zhuo <terryzhuo25@gmail.com>
Date: Fri, 2 Aug 2024 22:12:28 +0800
Subject: [PATCH 06/11] fix(inspect): update inspect

---
 bigcodebench/inspect.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/bigcodebench/inspect.py b/bigcodebench/inspect.py
index b06f5bd..920902e 100755
--- a/bigcodebench/inspect.py
+++ b/bigcodebench/inspect.py
@@ -19,7 +19,7 @@ def inspection(args):
         shutil.rmtree(path, ignore_errors=True)
     if not os.path.exists(path):
         os.makedirs(path)
-    problems = get_bigcodebench()
+    problems = get_bigcodebench(subset=flags.subset)
 
     eval_results = json.load(open(args.eval_results, "r"))
     for task_id, results in eval_results["eval"].items():
@@ -30,7 +30,7 @@ def inspection(args):
             os.makedirs(task_path)
         task_id_data = problems[task_id]
         with open(os.path.join(task_path, "ground_truth.py"), "w") as f:
-            f.write(task_id_data[f"{args.subset}_prompt"] + "\n\n" + task_id_data["canonical_solution"])
+            f.write(task_id_data[f"{args.split}_prompt"] + "\n\n" + task_id_data["canonical_solution"])
         
         # write test
         with open(os.path.join(task_path, "test_case.py"), "w") as f:
@@ -49,7 +49,10 @@ def inspection(args):
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--eval-results", required=True, type=str)
-    parser.add_argument("--subset", required=True, type=str)
+    parser.add_argument(
+        "--split", required=True, type=str, choices=["complete", "instruct"]
+    )
+    parser.add_argument("--subset", default="hard", type=str, choices=["full", "hard"])
     parser.add_argument("--in-place", action="store_true")
     args = parser.parse_args()
     

From 957ea7f2da13dbd10ccad66893c4a04c896e8d78 Mon Sep 17 00:00:00 2001
From: Terry Zhuo <terryzhuo25@gmail.com>
Date: Fri, 2 Aug 2024 22:13:48 +0800
Subject: [PATCH 07/11] fix(inspect): use args.subset instead of flags.subset

---
 bigcodebench/inspect.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigcodebench/inspect.py b/bigcodebench/inspect.py
index 920902e..ce6fb3b 100755
--- a/bigcodebench/inspect.py
+++ b/bigcodebench/inspect.py
@@ -19,7 +19,7 @@ def inspection(args):
         shutil.rmtree(path, ignore_errors=True)
     if not os.path.exists(path):
         os.makedirs(path)
-    problems = get_bigcodebench(subset=flags.subset)
+    problems = get_bigcodebench(subset=args.subset)
 
     eval_results = json.load(open(args.eval_results, "r"))
     for task_id, results in eval_results["eval"].items():

From d35fd70ec1d99534364ea411c47b934435ec9558 Mon Sep 17 00:00:00 2001
From: Terry Zhuo <terryzhuo25@gmail.com>
Date: Fri, 2 Aug 2024 22:15:58 +0800
Subject: [PATCH 08/11] fix(inspect): update args

---
 bigcodebench/inspect.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/bigcodebench/inspect.py b/bigcodebench/inspect.py
index ce6fb3b..da04fad 100755
--- a/bigcodebench/inspect.py
+++ b/bigcodebench/inspect.py
@@ -14,7 +14,7 @@ def inspection(args):
         -- completion.py: prompt + completion
         -- execution_trace.txt: execution trace
     """
-    path = os.path.join("inspect", args.eval_results.split("/")[-1].replace(".json", ""))
+    path = os.path.join(args.save_path, args.eval_results.split("/")[-1].replace(".json", ""))
     if args.in_place:
         shutil.rmtree(path, ignore_errors=True)
     if not os.path.exists(path):
@@ -48,12 +48,13 @@ def inspection(args):
                     f.write("="*50 + "\n")
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--eval-results", required=True, type=str)
+    parser.add_argument("--eval_results", required=True, type=str)
     parser.add_argument(
         "--split", required=True, type=str, choices=["complete", "instruct"]
     )
     parser.add_argument("--subset", default="hard", type=str, choices=["full", "hard"])
-    parser.add_argument("--in-place", action="store_true")
+    parser.add_argument("--save_path", default="inspect", type=str)
+    parser.add_argument("--in_place", action="store_true")
     args = parser.parse_args()
     
     inspection(args)

From a9cc5b147f65ee19685a036f1cad1dbdfd6a0620 Mon Sep 17 00:00:00 2001
From: Terry Zhuo <terryzhuo25@gmail.com>
Date: Fri, 2 Aug 2024 22:17:48 +0800
Subject: [PATCH 09/11] fix(doc): update inspect doc

---
 README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 16fd4a9..17d066c 100755
--- a/README.md
+++ b/README.md
@@ -320,7 +320,11 @@ Here are some tips to speed up the evaluation:
 You can inspect the failed samples by using the following command:
 
 ```bash
-bigcodebench.inspect --eval-results sample-sanitized-calibrated_eval_results.json --in-place
+# Inspect the failed samples and save the results to `inspect/`
+bigcodebench.inspect --eval_results sample-sanitized-calibrated_eval_results.json --split complete --subset hard
+
+# Re-run the inspection in place
+bigcodebench.inspect --eval_results sample-sanitized-calibrated_eval_results.json --split complete --subset hard --in_place
 ```
 
 ## 🚀 Full Script

From 4ab7c7f83cc3aa4ac28752c35434ce1439c91fa0 Mon Sep 17 00:00:00 2001
From: Terry Zhuo <terryzhuo25@gmail.com>
Date: Fri, 2 Aug 2024 22:25:46 +0800
Subject: [PATCH 10/11] fix(inspect): skip tasks missing from problems

---
 bigcodebench/inspect.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bigcodebench/inspect.py b/bigcodebench/inspect.py
index da04fad..272166c 100755
--- a/bigcodebench/inspect.py
+++ b/bigcodebench/inspect.py
@@ -28,6 +28,8 @@ def inspection(args):
         task_path = os.path.join(path, task_id)
         if not os.path.exists(task_path):
             os.makedirs(task_path)
+        if task_id not in problems:
+            continue
         task_id_data = problems[task_id]
         with open(os.path.join(task_path, "ground_truth.py"), "w") as f:
             f.write(task_id_data[f"{args.split}_prompt"] + "\n\n" + task_id_data["canonical_solution"])

From 617b5bdfc96de98bc77bf82cd521310522ad6e12 Mon Sep 17 00:00:00 2001
From: Terry Zhuo <terryzhuo25@gmail.com>
Date: Fri, 2 Aug 2024 22:28:30 +0800
Subject: [PATCH 11/11] fix(inspect): avoid creating empty folders for skipped tasks

---
 bigcodebench/inspect.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bigcodebench/inspect.py b/bigcodebench/inspect.py
index 272166c..50c7e01 100755
--- a/bigcodebench/inspect.py
+++ b/bigcodebench/inspect.py
@@ -23,13 +23,13 @@ def inspection(args):
 
     eval_results = json.load(open(args.eval_results, "r"))
     for task_id, results in eval_results["eval"].items():
+        if task_id not in problems:
+            continue
         if all(result["status"] == "pass" for result in results):
             continue
         task_path = os.path.join(path, task_id)
         if not os.path.exists(task_path):
             os.makedirs(task_path)
-        if task_id not in problems:
-            continue
         task_id_data = problems[task_id]
         with open(os.path.join(task_path, "ground_truth.py"), "w") as f:
             f.write(task_id_data[f"{args.split}_prompt"] + "\n\n" + task_id_data["canonical_solution"])