Add MATH tests to testbed (microsoft#914)

* add MATH eval to testbed * update --------- Co-authored-by: Qingyun Wu <[email protected]>
isaka · Dec 18, 2023 · aa946b3 · aa946b3
1 parent dd4a2da
commit aa946b3
Show file tree

Hide file tree

Showing 7 changed files with 263 additions and 0 deletions.
diff --git a/samples/tools/testbed/includes/math_requirements.txt b/samples/tools/testbed/includes/math_requirements.txt
@@ -0,0 +1,4 @@
+git+https://github.com/microsoft/autogen.git
+sympy
+matplotlib
+numpy
diff --git a/samples/tools/testbed/scenarios/MATH/README.md b/samples/tools/testbed/scenarios/MATH/README.md
@@ -0,0 +1,27 @@
+## Get json file to run
+
+This converts the math problems to JSON Lines format and writes the resulting `problems.jsonl` file to the `scenarios/MATH` folder.
+```sh
+cd samples/tools/testbed/
+python scenarios/MATH/problems_to_json.py
+```
+
+## Run the testbed
+
+Note: this first runs autogen on the math problems, and then uses an LLM as an answer checker to grade the answers.
+This means the results are not 100% accurate.
+
+```sh
+python run_scenarios.py scenarios/MATH/problems.jsonl -c <config_list> --requirements math_requirements.txt
+```
+
+## Get the correct count
+Use `--path` or `-p` to specify the path to the problem directory. The default is `./results/problems/`, which is the default save path of this testbed.
+```sh
+python scenarios/MATH/count_correct_math.py --path <path_to_problem_dir>
+```
+
+Example output:
+```
+Trial 0 | Total Correct: 10 | Total Problems: 17
+```
diff --git a/samples/tools/testbed/scenarios/MATH/answer.txt b/samples/tools/testbed/scenarios/MATH/answer.txt
@@ -0,0 +1 @@
+__ANSWER__
diff --git a/samples/tools/testbed/scenarios/MATH/count_correct_math.py b/samples/tools/testbed/scenarios/MATH/count_correct_math.py
@@ -0,0 +1,56 @@
+import argparse
+import json
+import os
+
+
+def main(args):
+    stars = "*" * 100
+
+    # initiate the correct count for each trial
+    correct_count = [0 for i in range(args.num_trials)]
+
+    for i in range(args.num_trials):
+        for problem_name in os.listdir(args.path):
+            problem_path = os.path.join(args.path, problem_name, str(i))
+            if os.path.isdir(problem_path):
+                checker_file_path = os.path.join(problem_path, "checker_messages.json")
+
+                with open(checker_file_path, "r") as file:
+                    checker_messages = json.load(file)
+
+                    check_result = checker_messages["checker_proxy"][-1]["content"].lower()
+
+                    if (
+                        "the answer is correct" in check_result
+                        or "the answer is approximated but should be correct" in check_result
+                    ):
+                        correct_count[i] += 1
+                        # print(f"{problem_name} | Correct")
+                    # else:
+                    # print(f"{problem_name} | Wrong")
+
+        print(f"{stars}\nTrial {i} | Total Correct: {correct_count[i]} | Total Problems: {len(os.listdir(args.path))}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="""Print Math Problems results.""".strip(),
+    )
+    parser.add_argument(
+        "--path",
+        "-p",
+        type=str,
+        default="./results/problems/",
+        help="Path to the problems directory",
+    )
+    # num trials
+    parser.add_argument(
+        "--num_trials",
+        "-n",
+        type=int,
+        default=1,
+        help="Number of trials to check",
+    )
+
+    args = parser.parse_args()
+    main(args)
diff --git a/samples/tools/testbed/scenarios/MATH/problems_to_json.py b/samples/tools/testbed/scenarios/MATH/problems_to_json.py
@@ -0,0 +1,77 @@
+import json
+
+# MATH problem statements (LaTeX source). Entry i is paired with answers[i]
+# when the scenario records are written.
+problems = [
+    "Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.",
+    "Find the value of $a_2+a_4+a_6+a_8+\\dots+a_{98}$ if $a_1, a_2, a_3, \\ldots$ is an arithmetic progression with common difference $1$ and \\[a_1+a_2+a_3+\\dots+a_{98}=137.\\]",
+    "Tina the tourist goes on a trip. She starts at the origin and drives north (in the positive $y$ direction) for $10$ units. Then she turns east (the positive $x$ direction) and as she's turning her camera flies out the window and lands exactly at $(0,10)$. She then drives $9$ units east, turns and drives $8$ units north.  She continues this pattern of turning and driving one unit less than after the previous turn, until stopping after driving $1$ unit east. She reaches for her camera only to find it missing! She activates the GPS homing device on her camera and drives back to it in a straight line. What is the equation of this line? Express your answer as $ax+by=c$, where $a$, $b$, and $c$ are integers, $a>0$, and $a$ is as small as possible.",
+    "For what negative value of $k$ is there exactly one solution to the system of equations \\begin{align*}\ny &= 2x^2 + kx + 6 \\\\\ny &= -x + 4?\n\\end{align*}",
+    "If $\\frac{3x^2-4x+1}{x-1}=m$, and $x$ can be any real number except $1$, what real values can $m$ NOT have?",
+    "Find all numbers $a$ for which the graph of $y=x^2+a$ and the graph of $y=ax$ intersect. Express your answer in interval notation.",
+    "If $\\displaystyle{f(x)=x^{(x+1)}(x+2)^{(x+3)}}$, then find the value of $f(0)+f(-1)+f(-2)+f(-3)$.",
+    "An envelope contains eight bills: 2 ones, 2 fives, 2 tens, and 2 twenties. Two bills are drawn at random without replacement. What is the probability that their sum is $\\$20$ or more?",
+    "Find the coefficient of $x^2$ in the expansion of the product $$(1-x)(1+2x)(1-3x)\\dotsm(1+14x)(1-15x).$$",
+    "All 50 states as well as the District of Columbia and Puerto Rico, have distinct two-letter postal abbreviations. If a two-letter sequence of letters (such as CO or EE) is chosen at random, what is the probability that it is a postal abbreviation for one of the 50 states, the District of Columbia, or Puerto Rico? Express your answer as a common fraction.",
+    "Let $x$ and $y$ be real numbers.  Find the set of possible values of\n\\[\\frac{(x + y)(1 - xy)}{(1 + x^2)(1 + y^2)}.\\]",
+    "On a number line, the coordinates of $P$ and $Q$ are 8 and 48, respectively. The midpoint of $\\overline{PQ}$ is $B$, the midpoint of $\\overline{BQ}$ is $C$, and the midpoint of $\\overline{PC}$ is $D$. What is the coordinate of $D$?",
+    "Find $24^{-1} \\pmod{11^2}$. That is, find the residue $b$ for which $24b \\equiv 1\\pmod{11^2}$.\n\nExpress your answer as an integer from $0$ to $11^2-1$, inclusive.",
+    "There are two cameras that take pictures of a traffic intersection. Camera A starts taking pictures at $6$ AM and takes a picture every $11$ minutes. Camera B starts taking pictures at $7$ AM and takes pictures every $7$ minutes. Camera A and Camera B take a picture at the same time at four different times before noon. When Camera A and Camera B take their last picture together, how many minutes before noon is it?",
+    "Let $z$ be a complex number such that $z^{13} = 1.$  Let $w_1,$ $w_2,$ $\\dots,$ $w_k$ be all the possible values of\n\\[z + z^3 + z^4 + z^9 + z^{10} + z^{12}.\\]Find $w_1^2 + w_2^2 + \\dots + w_k^2.$",
+    "There are 190 people on the beach. 110 are wearing sunglasses, 70 are wearing bathing suits, and 95 are wearing a hat.  Everyone is wearing at least one of these items. 30 are wearing both bathing suits and sunglasses. 25 are wearing both bathing suits and a hat. 40 are wearing both sunglasses and a hat.  How many people are wearing all three items?",
+    "Completely simplify and rationalize the denominator: $$\\frac{\\sqrt{160}}{\\sqrt{252}}\\times\\frac{\\sqrt{245}}{\\sqrt{108}}$$",
+]
+# Ground-truth answers (LaTeX source), in the same order as `problems`:
+# answers[i] is the expected answer to problems[i].
+answers = [
+    # First 6 answers: algebra problems
+    "(-\\infty, -14)\\cup(-3,\\infty)",
+    "93",
+    "4x-5y=-50",
+    "-5",
+    "2",
+    "(-\\infty,0]\\cup[4,\\infty)",
+    # Remaining 11 answers: 2 problems from each category (1 algebra problem was deleted)
+    "\\frac{10}{9}",
+    "\\frac{1}{2}",
+    "-588",
+    # NOTE(review): the leading space inside the next string looks unintentional — confirm.
+    " \\frac{1}{13}",
+    "\\left[ -\\frac{1}{2}, \\frac{1}{2} \\right]",
+    "23",
+    "116",
+    "41",
+    "43",
+    "10",
+    "\\frac{5\\sqrt{42}}{27}",
+]
+
+
+def problem_to_json():
+    with open("problems.jsonl", "w") as f:
+        for i, problem in enumerate(problems):
+            # a = {
+            #     'id': f'problem{i}',
+            #     'template': 'scenario.py',
+            #     'substitutions': {
+            #         '__PROMPT__': problem,
+            #         '__ANSWER__': answers[i],
+            #     },
+            # }
+            a = {
+                "id": f"problem{i}",
+                "template": "./",
+                "substitutions": {"prompt.txt": {"__PROMPT__": problem}, "answer.txt": {"__ANSWER__": answers[i]}},
+            }
+            # Convert the dictionary to a JSON string and write it to the file
+            json_string = json.dumps(a)
+            f.write(json_string + "\n")  # Add a newline character after each JSON object
+
+
+problem_to_json()
+
+problems = []
+with open("problems.jsonl", "r") as file:
+    for line in file:
+        # Parse each line as a JSON object
+        problem = json.loads(line)
+        problems.append(problem)
+        print(problem["substitutions"])
+        print()
+
+# Now 'problems' is a list of dictionaries, each representing a problem
diff --git a/samples/tools/testbed/scenarios/MATH/prompt.txt b/samples/tools/testbed/scenarios/MATH/prompt.txt
@@ -0,0 +1 @@
+__PROMPT__
diff --git a/samples/tools/testbed/scenarios/MATH/scenario.py b/samples/tools/testbed/scenarios/MATH/scenario.py
@@ -0,0 +1,97 @@
+import os
+import json
+import autogen
+
+import testbed_utils
+
+testbed_utils.init()
+
+
+PROMPT = ""
+with open("prompt.txt", "rt") as fh:
+    PROMPT = fh.read()
+
+ANSWER = ""
+with open("answer.txt", "rt") as fh:
+    ANSWER = fh.read()
+
+
+####################
+config_list = autogen.config_list_from_json(
+    "OAI_CONFIG_LIST",
+    filter_dict={"model": ["gpt40613"]},
+)
+llm_config = {
+    "cache_seed": 42,
+    "config_list": config_list,
+    "timeout": 600,
+}
+code_execution_config = {
+    "work_dir": "coding",
+    "use_docker": False,  # set to True or image name like "python:3" to use docker
+}
+# ---------between "user" and "assistant"---------
+assistant = autogen.AssistantAgent(name="assistant", llm_config=llm_config)
+user_proxy = autogen.UserProxyAgent(
+    name="user",
+    human_input_mode="NEVER",
+    code_execution_config=code_execution_config,
+    max_consecutive_auto_reply=10,
+    is_termination_msg=lambda x: x.get("content", "")
+    and (x.get("content", "").rstrip().endswith("TERMINATE") or x.get("content", "").rstrip().endswith("TERMINATE.")),
+)
+
+user_proxy.initiate_chat(assistant, message=PROMPT)
+
+
+# --------- extract reply ---------
+response_with_ans = ""
+messages = assistant._oai_messages[user_proxy]
+for j in range(len(messages) - 1, -1, -1):
+    if (
+        messages[j]["role"] == "assistant"
+        and messages[j]["content"].strip() != "TERMINATE"
+        and messages[j]["content"].strip() != "TERMINATE."
+    ):
+        response_with_ans = messages[j]["content"]
+        break
+
+
+# ---------between "answer_checker" and "checker_proxy"---------
+# define answer checker chat
+
+check_sys_msg = """You are a helpful AI assistant. You will use your coding and language skills to verify the answer.
+You are given:
+    1. A problem.
+    2. A reply with the answer to the problem.
+    3. A ground truth answer.
+Please do the following:
+1. Extract the answer in the reply: "The answer is <answer extracted>".
+2. Check whether the answer in the reply matches the ground truth answer. When comparison is not obvious (for example, 3*\\sqrt(6) and 7.348), you may write code to check the answer and wait for the user to execute the code.
+3. After everything is done, please choose a reply from the following options:
+    - "The answer is correct."
+    - "The answer is approximated but should be correct. Correct Answer: <ground truth answer> | Answer extracted: <answer extracted>."
+    - "The answer is incorrect. Correct Answer: <ground truth answer> | Answer extracted: <answer extracted>."
+    - "The reply doesn't contain an answer." """
+
+answer_checker = autogen.AssistantAgent(name="checker", llm_config=llm_config, system_message=check_sys_msg)
+checker_proxy = autogen.UserProxyAgent(
+    name="checker_proxy",
+    human_input_mode="NEVER",
+    code_execution_config=code_execution_config,
+    max_consecutive_auto_reply=5,
+    is_termination_msg=lambda x: x.get("content", "").lower()
+    and (
+        "the answer is correct" in x.get("content", "").lower()
+        or "the answer is incorrect" in x.get("content", "").lower()
+        or "the reply doesn't contain an answer" in x.get("content", "").lower()
+        or "the answer is approximated but should be correct" in x.get("content", "").lower()
+    ),
+)
+
+message_to_check = "Problem: " + PROMPT + f"\n\nReply: {response_with_ans}\n\nGround truth answer: " + ANSWER
+checker_proxy.initiate_chat(answer_checker, message=message_to_check)
+
+
+####################
+testbed_utils.finalize(agents=[assistant, user_proxy, answer_checker, checker_proxy])