Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Support workforce in GAIA benchmark #1301

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 160 additions & 1 deletion camel/benchmarks/gaia.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
from camel.benchmarks import BaseBenchmark
from camel.messages.base import BaseMessage
from camel.retrievers.auto_retriever import AutoRetriever
from camel.societies.workforce.worker import Worker
from camel.societies.workforce.workforce import Workforce
from camel.tasks import Task

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -271,6 +274,84 @@ def run( # type: ignore[override]

return self._generate_summary()

def run_workforce(  # type: ignore[override]
    self,
    workforce: Workforce,
    on: Literal["train", "valid", "test"],
    level: Union[int, List[int], Literal["all"]],
    randomize: bool = False,
    subset: Optional[int] = None,
) -> Dict[str, Any]:
    r"""Run the benchmark by dispatching each task to a workforce.

    Args:
        workforce (Workforce): The workforce used to process each task.
        on (Literal["train", "valid", "test"]): The dataset split to run
            on. Only ``"valid"`` and ``"test"`` are supported; passing
            ``"train"`` raises :class:`ValueError`.
        level (Union[int, List[int], Literal["all"]]): The difficulty
            level(s) to run: a single level, a list of levels, or
            ``"all"`` for levels 1-3.
        randomize (bool, optional): Whether to shuffle the data before
            running. (default: :obj:`False`)
        subset (Optional[int], optional): If given, run only the first
            ``subset`` tasks (after optional shuffling).
            (default: :obj:`None`)

    Returns:
        Dict[str, Any]: The summary of the benchmark results.

    Raises:
        ValueError: If ``on`` is not ``"valid"`` or ``"test"``, or
            ``level`` contains a value outside ``{1, 2, 3}``.
    """
    # Validate inputs.
    if on not in ["valid", "test"]:
        raise ValueError(
            f"Invalid value for `on`: {on}, expected 'valid' or 'test'."
        )

    levels = (
        [1, 2, 3]
        if level == "all"
        else [level]
        if isinstance(level, int)
        else level
    )
    # `lvl` (not `level`) so the error message below reports the
    # caller's original argument, not a loop variable.
    if not all(
        isinstance(lvl, int) and lvl in [1, 2, 3] for lvl in levels
    ):
        raise ValueError(
            f"Invalid value for `level`: {level}, expected 1, 2, 3 "
            "or 'all'."
        )

    logger.info(f"Running benchmark on {on} set at levels {levels}.")
    datas = [data for data in self._data[on] if data["Level"] in levels]

    # Shuffle and subset data if necessary.
    if randomize:
        random.shuffle(datas)
    if subset:
        datas = datas[:subset]

    logger.info(f"Number of tasks: {len(datas)}")

    # Initialize results storage.
    self._results = []

    # Process tasks, recording one JSON object per task.
    with open(self.save_to, "w") as f:
        for task in tqdm(datas, desc="Running"):
            if not self._prepare_task(task):
                continue

            try:
                result = workforce.process_task(self._create_task(task))
                self._process_workforce_result(
                    workforce, task, result, f, None
                )
            except Exception as e:
                self._process_workforce_result(workforce, task, None, f, e)
            finally:
                # NOTE(review): clearing the private `_running` flag
                # reaches into Workforce internals so the instance can be
                # reset and reused for the next task — consider a public
                # Workforce API for this instead.
                workforce._running = False
                workforce.reset()

    return self._generate_summary()

def _prepare_task(self, task: Dict[str, Any]) -> bool:
r"""Prepare the task by validating and enriching its data."""
if task["file_name"]:
Expand Down Expand Up @@ -307,14 +388,32 @@ def _create_user_message(self, task: Dict[str, Any]) -> BaseMessage:
content=task["Question"],
)

def _create_task(self, task: Dict[str, Any]) -> Task:
    r"""Create a workforce :class:`Task` from a benchmark task record.

    Args:
        task (Dict[str, Any]): The benchmark task record; must contain
            the ``"task_id"`` and ``"Question"`` keys.

    Returns:
        Task: The workforce task created from the input record.
    """
    return Task(id=str(task["task_id"]), content=task["Question"])

def _process_result(
self,
agent: ChatAgent,
task: Dict[str, Any],
result: Any,
file_obj: Any,
) -> None:
r"""Process and store the result of a task."""
r"""Process and store the result of a task.

Args:
agent (ChatAgent): The agent that processed the task.
task (Dict[str, Any]): The task that was processed.
result (Any): The result of processing the task.
file_obj (Any): The file object to write the results to.
"""
model_answer = self.get_final_answer(result.msgs[0].content)
final_answer = task["Final answer"]
score = self.question_scorer(model_answer, final_answer)
Expand All @@ -335,6 +434,66 @@ def _process_result(
file_obj.write(json.dumps(result_data, indent=2) + "\n")
file_obj.flush()

def _process_workforce_result(
    self,
    workforce: Workforce,
    task: Dict[str, Any],
    result: Optional[Task],
    file_obj: Any,
    err: Optional[Exception],
) -> None:
    r"""Process and store the result of a workforce-processed task.

    Scores the model answer against the ground truth and appends one
    JSON object (answer, score, per-agent history and tool calls) to
    ``file_obj`` and ``self._results``.

    Args:
        workforce (Workforce): The workforce that processed the task.
        task (Dict[str, Any]): The benchmark task record.
        result (Optional[Task]): The processed task, or ``None`` if
            processing failed.
        file_obj (Any): A writable file object for the JSONL results.
        err (Optional[Exception]): The error raised during processing,
            or ``None`` on success.
    """
    final_answer = task["Final answer"]
    if err is None and result is not None:
        model_answer = self.get_final_answer(str(result.result))
        score = self.question_scorer(model_answer, final_answer)
    else:
        # Failed tasks are recorded with a sentinel answer and score 0.
        model_answer = "ERROR"
        score = False

    # Collect the coordinator/task agents plus each worker's agent so
    # their conversation histories can be recorded side by side.
    # NOTE(review): this reads the private `workforce._children`
    # attribute — consider a public accessor on Workforce.
    agents = [workforce.coordinator_agent, workforce.task_agent]
    workers = [
        child for child in workforce._children if isinstance(child, Worker)
    ]
    agent_names = ["coordinator", "task"] + [
        worker.node_id for worker in workers
    ]
    for worker in workers:
        agents.append(worker.worker)  # type: ignore[attr-defined]

    history = {}
    tool_calls: Dict[str, Any] = {}

    for agent, agent_name in zip(agents, agent_names):
        history[agent_name] = agent.memory.get_context()[0]
        tool_calls[agent_name] = []
        for h in history[agent_name]:
            if h.get("function_call", None):
                tool_calls[agent_name].append(h["function_call"])  # type: ignore[union-attr,typeddict-item]

    result_data = {
        "task_id": task["task_id"],
        "question": task["Question"],
        "level": task["Level"],
        "model_answer": model_answer,
        "ground_truth": final_answer,
        "tool_calls": tool_calls,
        "error": str(err) if err else None,
        "score": int(score),
        "history": history,
    }
    self._results.append(result_data)
    file_obj.write(json.dumps(result_data, indent=2) + "\n")
    file_obj.flush()

def _handle_error(
self, task: Dict[str, Any], error: Exception, file_obj: Any
) -> None:
Expand Down
12 changes: 6 additions & 6 deletions camel/societies/workforce/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,12 @@ def check_if_running(running: bool) -> Callable:
def decorator(func):
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        # Guard restored: the commented-out version silently allowed
        # operations on a workforce in the wrong running state, hiding
        # real bugs. Callers that need to reuse a workforce should
        # reset it between runs rather than disable this check.
        if self._running != running:
            status = "not running" if running else "running"
            raise RuntimeError(
                f"The workforce is {status}. Cannot perform the "
                f"operation {func.__name__}."
            )
        return func(self, *args, **kwargs)

    return wrapper
Expand Down
97 changes: 97 additions & 0 deletions examples/benchmarks/gaia_workforce.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========


# Example script: run the GAIA benchmark with a CAMEL Workforce.

from camel.agents import ChatAgent
from camel.benchmarks import DefaultGAIARetriever, GAIABenchmark
from camel.messages import BaseMessage
from camel.models import ModelFactory
from camel.runtime import RemoteHttpRuntime
from camel.societies.workforce import Workforce
from camel.toolkits import CodeExecutionToolkit
from camel.types import ModelPlatformType, ModelType, StorageType

# Retriever the benchmark uses to index task attachments into a local
# Qdrant vector store.
retriever = DefaultGAIARetriever(
    vector_storage_local_path="local_data2/", storage_type=StorageType.QDRANT
)

# Benchmark writes one JSON object per task to `results.jsonl`.
benchmark = GAIABenchmark(
    data_dir="datasets_test",
    processes=1,
    save_to="results.jsonl",
    retriever=retriever,
)

print(f"Number of validation examples: {len(benchmark.valid)}")
print(f"Number of test examples: {len(benchmark.test)}")


toolkit = CodeExecutionToolkit(verbose=True)
# NOTE(review): `runtime` is never used below — the agent is built from
# `toolkit.get_tools()` directly. Confirm whether the remote runtime is
# actually needed; otherwise this call (and its import) can be dropped.
runtime = RemoteHttpRuntime("localhost").add(
    toolkit.get_tools(),
    "camel.toolkits.CodeExecutionToolkit",
)

# System prompt enforcing the GAIA answer format ("FINAL ANSWER: ...")
# so the benchmark's answer extraction can parse the model output.
task_prompt = """
You are a general AI assistant. I will ask you a question. Report your
thoughts, and finish your answer with the following template:
FINAL ANSWER: [YOUR FINAL ANSWER].
YOUR FINAL ANSWER should be a number OR as few words as possible OR a
comma separated list of numbers and/or strings.
If you are asked for a number, don't use comma to write your number
neither use units such as $ or percent sign unless specified otherwise.
If you are asked for a string, don't use articles, neither
abbreviations (e.g. for cities), and write the digits in plain text
unless specified otherwise.
If you are asked for a comma separated list, apply the above rules
depending of whether the element to be put in the list is a number or
a string.
""".strip()

sys_msg = BaseMessage.make_assistant_message(
    role_name="Assistant",
    content=task_prompt,
)

tools = toolkit.get_tools()

model = ModelFactory.create(
    model_platform=ModelPlatformType.DEFAULT,
    model_type=ModelType.DEFAULT,
)


# Single worker agent with code-execution tools.
agent = ChatAgent(
    sys_msg,
    model,
    tools=tools,
)

workforce = Workforce("GAIA Workforce", new_worker_agent_kwargs={})
workforce.add_single_agent_worker(
    "An agent that can do code execution",
    worker=agent,
)

# Run one validation task across all levels and report the score.
result = benchmark.run_workforce(workforce, "valid", level="all", subset=1)
print("correct:", result["correct"])
print("total:", result["total"])

# ruff: noqa: E501
"""
Number of validation examples: 165
Number of test examples: 300
correct: 0
total: 3
"""
Loading