Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Support workforce in GAIA benchmark #1301

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 160 additions & 1 deletion camel/benchmarks/gaia.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
from camel.benchmarks import BaseBenchmark
from camel.messages.base import BaseMessage
from camel.retrievers.auto_retriever import AutoRetriever
from camel.societies.workforce.worker import Worker
from camel.societies.workforce.workforce import Workforce
from camel.tasks import Task

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -271,6 +274,84 @@ def run( # type: ignore[override]

return self._generate_summary()

def run_workforce(  # type: ignore[override]
    self,
    workforce: Workforce,
    on: Literal["train", "valid", "test"],
    level: Union[int, List[int], Literal["all"]],
    randomize: bool = False,
    subset: Optional[int] = None,
) -> Dict[str, Any]:
    r"""Run the benchmark by dispatching each task to a workforce.

    Args:
        workforce (Workforce): The workforce used to process each task.
        on (Literal["train", "valid", "test"]): The dataset split to run
            on. Only ``"valid"`` and ``"test"`` are supported; passing
            ``"train"`` raises :class:`ValueError`.
        level (Union[int, List[int], Literal["all"]]): The difficulty
            level(s) to run: a single level, a list of levels, or
            ``"all"`` for levels 1-3.
        randomize (bool, optional): Whether to shuffle the data before
            running. (default: :obj:`False`)
        subset (Optional[int], optional): If given, run only the first
            ``subset`` tasks (after optional shuffling).
            (default: :obj:`None`)

    Returns:
        Dict[str, Any]: The summary of the benchmark results.

    Raises:
        ValueError: If ``on`` is not ``"valid"`` or ``"test"``, or
            ``level`` contains a value outside ``{1, 2, 3}``.
    """
    # Validate inputs.
    if on not in ["valid", "test"]:
        raise ValueError(
            f"Invalid value for `on`: {on}, expected 'valid' or 'test'."
        )

    levels = (
        [1, 2, 3]
        if level == "all"
        else [level]
        if isinstance(level, int)
        else level
    )
    # `lvl` (not `level`) so the error message below reports the
    # caller's original argument, not a loop variable.
    if not all(
        isinstance(lvl, int) and lvl in [1, 2, 3] for lvl in levels
    ):
        raise ValueError(
            f"Invalid value for `level`: {level}, expected 1, 2, 3 "
            "or 'all'."
        )

    logger.info(f"Running benchmark on {on} set at levels {levels}.")
    datas = [data for data in self._data[on] if data["Level"] in levels]

    # Shuffle and subset data if necessary.
    if randomize:
        random.shuffle(datas)
    if subset:
        datas = datas[:subset]

    logger.info(f"Number of tasks: {len(datas)}")

    # Initialize results storage.
    self._results = []

    # Process tasks, recording one JSON object per task.
    with open(self.save_to, "w") as f:
        for task in tqdm(datas, desc="Running"):
            if not self._prepare_task(task):
                continue

            try:
                result = workforce.process_task(self._create_task(task))
                self._process_workforce_result(
                    workforce, task, result, f, None
                )
            except Exception as e:
                self._process_workforce_result(workforce, task, None, f, e)
            finally:
                # NOTE(review): clearing the private `_running` flag
                # reaches into Workforce internals so the instance can be
                # reset and reused for the next task — consider a public
                # Workforce API for this instead.
                workforce._running = False
                workforce.reset()

    return self._generate_summary()

def _prepare_task(self, task: Dict[str, Any]) -> bool:
r"""Prepare the task by validating and enriching its data."""
if task["file_name"]:
Expand Down Expand Up @@ -307,14 +388,32 @@ def _create_user_message(self, task: Dict[str, Any]) -> BaseMessage:
content=task["Question"],
)

def _create_task(self, task: Dict[str, Any]) -> Task:
    r"""Create a workforce :class:`Task` from a benchmark task record.

    Args:
        task (Dict[str, Any]): The benchmark task record; must contain
            the ``"task_id"`` and ``"Question"`` keys.

    Returns:
        Task: The workforce task created from the input record.
    """
    return Task(id=str(task["task_id"]), content=task["Question"])

def _process_result(
self,
agent: ChatAgent,
task: Dict[str, Any],
result: Any,
file_obj: Any,
) -> None:
r"""Process and store the result of a task."""
r"""Process and store the result of a task.

Args:
agent (ChatAgent): The agent that processed the task.
task (Dict[str, Any]): The task that was processed.
result (Any): The result of processing the task.
file_obj (Any): The file object to write the results to.
"""
model_answer = self.get_final_answer(result.msgs[0].content)
final_answer = task["Final answer"]
score = self.question_scorer(model_answer, final_answer)
Expand All @@ -335,6 +434,66 @@ def _process_result(
file_obj.write(json.dumps(result_data, indent=2) + "\n")
file_obj.flush()

def _process_workforce_result(
    self,
    workforce: Workforce,
    task: Dict[str, Any],
    result: Optional[Task],
    file_obj: Any,
    err: Optional[Exception],
) -> None:
    r"""Process and store the result of a workforce-processed task.

    Scores the model answer against the ground truth and appends one
    JSON object (answer, score, per-agent history and tool calls) to
    ``file_obj`` and ``self._results``.

    Args:
        workforce (Workforce): The workforce that processed the task.
        task (Dict[str, Any]): The benchmark task record.
        result (Optional[Task]): The processed task, or ``None`` if
            processing failed.
        file_obj (Any): A writable file object for the JSONL results.
        err (Optional[Exception]): The error raised during processing,
            or ``None`` on success.
    """
    final_answer = task["Final answer"]
    if err is None and result is not None:
        model_answer = self.get_final_answer(str(result.result))
        score = self.question_scorer(model_answer, final_answer)
    else:
        # Failed tasks are recorded with a sentinel answer and score 0.
        model_answer = "ERROR"
        score = False

    # Collect the coordinator/task agents plus each worker's agent so
    # their conversation histories can be recorded side by side.
    # NOTE(review): this reads the private `workforce._children`
    # attribute — consider a public accessor on Workforce.
    agents = [workforce.coordinator_agent, workforce.task_agent]
    workers = [
        child for child in workforce._children if isinstance(child, Worker)
    ]
    agent_names = ["coordinator", "task"] + [
        worker.node_id for worker in workers
    ]
    for worker in workers:
        agents.append(worker.worker)  # type: ignore[attr-defined]

    history = {}
    tool_calls: Dict[str, Any] = {}

    for agent, agent_name in zip(agents, agent_names):
        history[agent_name] = agent.memory.get_context()[0]
        tool_calls[agent_name] = []
        for h in history[agent_name]:
            if h.get("function_call", None):
                tool_calls[agent_name].append(h["function_call"])  # type: ignore[union-attr,typeddict-item]

    result_data = {
        "task_id": task["task_id"],
        "question": task["Question"],
        "level": task["Level"],
        "model_answer": model_answer,
        "ground_truth": final_answer,
        "tool_calls": tool_calls,
        "error": str(err) if err else None,
        "score": int(score),
        "history": history,
    }
    self._results.append(result_data)
    file_obj.write(json.dumps(result_data, indent=2) + "\n")
    file_obj.flush()

def _handle_error(
self, task: Dict[str, Any], error: Exception, file_obj: Any
) -> None:
Expand Down
12 changes: 6 additions & 6 deletions camel/societies/workforce/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,12 @@ def check_if_running(running: bool) -> Callable:
def decorator(func):
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        # Guard restored: the commented-out version silently allowed
        # operations on a workforce in the wrong running state, hiding
        # real bugs. Callers that need to reuse a workforce should
        # reset it between runs rather than disable this check.
        if self._running != running:
            status = "not running" if running else "running"
            raise RuntimeError(
                f"The workforce is {status}. Cannot perform the "
                f"operation {func.__name__}."
            )
        return func(self, *args, **kwargs)

    return wrapper
Expand Down
97 changes: 97 additions & 0 deletions examples/benchmarks/gaia_workforce.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========


# Example script: run the GAIA benchmark with a CAMEL Workforce.

from camel.agents import ChatAgent
from camel.benchmarks import DefaultGAIARetriever, GAIABenchmark
from camel.messages import BaseMessage
from camel.models import ModelFactory
from camel.runtime import RemoteHttpRuntime
from camel.societies.workforce import Workforce
from camel.toolkits import CodeExecutionToolkit
from camel.types import ModelPlatformType, ModelType, StorageType

# Retriever the benchmark uses to index task attachments into a local
# Qdrant vector store.
retriever = DefaultGAIARetriever(
    vector_storage_local_path="local_data2/", storage_type=StorageType.QDRANT
)

# Benchmark writes one JSON object per task to `results.jsonl`.
benchmark = GAIABenchmark(
    data_dir="datasets_test",
    processes=1,
    save_to="results.jsonl",
    retriever=retriever,
)

print(f"Number of validation examples: {len(benchmark.valid)}")
print(f"Number of test examples: {len(benchmark.test)}")


toolkit = CodeExecutionToolkit(verbose=True)
# NOTE(review): `runtime` is never used below — the agent is built from
# `toolkit.get_tools()` directly. Confirm whether the remote runtime is
# actually needed; otherwise this call (and its import) can be dropped.
runtime = RemoteHttpRuntime("localhost").add(
    toolkit.get_tools(),
    "camel.toolkits.CodeExecutionToolkit",
)

# System prompt enforcing the GAIA answer format ("FINAL ANSWER: ...")
# so the benchmark's answer extraction can parse the model output.
task_prompt = """
You are a general AI assistant. I will ask you a question. Report your
thoughts, and finish your answer with the following template:
FINAL ANSWER: [YOUR FINAL ANSWER].
YOUR FINAL ANSWER should be a number OR as few words as possible OR a
comma separated list of numbers and/or strings.
If you are asked for a number, don't use comma to write your number
neither use units such as $ or percent sign unless specified otherwise.
If you are asked for a string, don't use articles, neither
abbreviations (e.g. for cities), and write the digits in plain text
unless specified otherwise.
If you are asked for a comma separated list, apply the above rules
depending of whether the element to be put in the list is a number or
a string.
""".strip()

sys_msg = BaseMessage.make_assistant_message(
    role_name="Assistant",
    content=task_prompt,
)

tools = toolkit.get_tools()

model = ModelFactory.create(
    model_platform=ModelPlatformType.DEFAULT,
    model_type=ModelType.DEFAULT,
)


# Single worker agent with code-execution tools.
agent = ChatAgent(
    sys_msg,
    model,
    tools=tools,
)

workforce = Workforce("GAIA Workforce", new_worker_agent_kwargs={})
workforce.add_single_agent_worker(
    "An agent that can do code execution",
    worker=agent,
)

# Run one validation task across all levels and report the score.
result = benchmark.run_workforce(workforce, "valid", level="all", subset=1)
print("correct:", result["correct"])
print("total:", result["total"])

# ruff: noqa: E501
"""
Number of validation examples: 165
Number of test examples: 300
correct: 0
total: 3
"""
Loading