Changing the return format from metrics computation
shtoshni committed Nov 27, 2024
1 parent 261331d commit 4e8ddb3
Showing 6 changed files with 75 additions and 44 deletions.
18 changes: 11 additions & 7 deletions nemo_skills/evaluation/metrics/arena_metrics.py
@@ -73,19 +73,20 @@ def fill_up_missing(self):
     def is_incomplete(self, elem):
         return 'judgement-gen-base' not in elem or 'judgement-base-gen' not in elem or 'generation' not in elem
 
-    def update(self, predictions, aggregation_mode):
+    def update(self, predictions):
         """Updating the evaluation results with the current element.
 
         Args:
             predictions (list[dict]): aggregated predictions across all generations.
                 The content of the file is benchmark specific.
-            aggregation_mode (str): "best", "first", etc. Might vary by benchmark.
         """
         # this shouldn't do any heavy calculation, but just read the metric from existing json entry
         # all the heavy lifting should be done in the evaluation script
         self.total += 1
         self.scores.append([])
-        if aggregation_mode == "best":
+        if len(predictions) > 1:
+            self.agg_mode = f"pass@{len(predictions)}"
+
             judge_scores = [self._get_judge_score(elem['judgement-gen-base']) for elem in predictions]
             # adding the best score out of all the generations
             possible_scores = ['A>>B', 'A>B', 'A=B', 'B>A', 'B>>A']
@@ -110,24 +111,27 @@ def update(self, predictions, aggregation_mode):
                     break
             else:
                 self.scores[-1].append(None)  # in case judge didn't generate a valid score
-        elif aggregation_mode == "first":
+        else:
+            # Single prediction
+            self.agg_mode = "greedy"
+
             self.lengths += len(predictions[0]['generation'])
             self.scores[-1] = [
                 self._get_judge_score(predictions[0]['judgement-gen-base']),
                 self._get_judge_score(predictions[0]['judgement-base-gen']),
             ]
-        else:
-            raise ValueError(f"Unsupported mode {aggregation_mode}")
 
     def get_metrics(self):
         from nemo_skills.evaluation.arena_utils import get_aggregate_score
 
         metrics = {'num_entries': self.total}
         metrics.update(get_aggregate_score(self.scores))
         metrics['avg_response_length'] = self.lengths / self.total
-        return metrics
+        return {self.agg_mode: metrics}
 
     def reset(self):
         self.scores = []  # list of lists
         self.lengths = 0
         self.total = 0
+        # Set automatically
+        self.agg_mode = "greedy"
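
Taken together, the hunks above switch ArenaMetrics to the convention this commit introduces across the metrics classes: update() infers the aggregation mode from the number of predictions instead of receiving an aggregation_mode argument, and get_metrics() nests its results under that mode. A minimal standalone sketch of that convention follows; the ExampleMetrics class is illustrative only and not code from this repository.

class ExampleMetrics:
    def __init__(self):
        self.reset()

    def update(self, predictions):
        self.total += 1
        if len(predictions) > 1:
            # multiple generations per sample -> pass@k style aggregation
            self.agg_mode = f"pass@{len(predictions)}"
            self.correct += any(elem.get('is_correct', False) for elem in predictions)
        else:
            # single greedy generation
            self.agg_mode = "greedy"
            self.correct += predictions[0].get('is_correct', False)

    def get_metrics(self):
        metrics = {"num_entries": self.total, "accuracy": 100.0 * self.correct / self.total}
        # new return format: metrics keyed by the detected aggregation mode
        return {self.agg_mode: metrics}

    def reset(self):
        self.total = 0
        self.correct = 0
        # set automatically inside update()
        self.agg_mode = "greedy"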
26 changes: 14 additions & 12 deletions nemo_skills/evaluation/metrics/code_metrics.py
@@ -27,10 +27,6 @@ def fill_up_missing(self):
     def is_incomplete(self, elem):
         return 'is_correct' not in elem or 'is_correct-plus' not in elem
 
-    def _update_perf_dict(self, perf_dict, correct, correct_plus):
-        perf_dict["total_correct"] += correct
-        perf_dict["total_correct_plus"] += correct_plus
-
     def update(self, predictions):
         """Updating the evaluation results with the current element.
 
@@ -41,13 +37,16 @@ def update(self, predictions):
         self.total += 1
 
         if len(predictions) > 1:
-            correct = any([elem['is_correct'] for elem in predictions])
-            correct_plus = any([elem['is_correct-plus'] for elem in predictions])
-            self._update_perf_dict(self.agg_mode_dict["best"], correct, correct_plus)
+            self.agg_mode = f"pass@{len(predictions)}"
+
+            self.total_correct += any([elem['is_correct'] for elem in predictions])
+            self.total_correct_plus += any([elem['is_correct-plus'] for elem in predictions])
         else:
-            correct = predictions[0]['is_correct']
-            correct_plus = predictions[0]['is_correct-plus']
-            self._update_perf_dict(self.agg_mode_dict["greedy"], correct, correct_plus)
+            # If single prediction, set it to greedy aggregation mode
+            self.agg_mode = "greedy"
+
+            self.total_correct += predictions[0]['is_correct']
+            self.total_correct_plus += predictions[0]['is_correct-plus']
 
     def get_metrics(self):
         metrics_dict = {}
@@ -57,8 +56,11 @@ def get_metrics(self):
             metrics_dict[agg_mode]["passing_base_tests"] = (agg_metric_dict["total_correct"] / self.total) * 100.0
             metrics_dict[agg_mode]["passing_plus_tests"] = (agg_metric_dict["total_correct_plus"] / self.total) * 100.0
 
-        return metrics_dict
+        return {self.agg_mode: metrics_dict}
 
     def reset(self):
         self.total = 0
-        self.agg_mode_dict = defaultdict(lambda: defaultdict(int))
+        self.total_correct = 0
+        self.total_correct_plus = 0
+        # Aggregation mode is automatically set
+        self.agg_mode = "greedy"
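
One practical consequence for callers: after this commit every get_metrics() returns a mapping from the aggregation mode to the metric values, so consumers iterate one level deeper. The print_metrics helper below is a hypothetical sketch of such a consumer and is not a function from nemo_skills.

def print_metrics(metrics_by_mode):
    # metrics_by_mode has the new shape, e.g. {"pass@4": {"num_entries": ..., "passing_base_tests": ...}}
    for agg_mode, metrics in metrics_by_mode.items():
        print(f"----- {agg_mode} -----")
        for name, value in metrics.items():
            print(f"{name}: {value:.2f}" if isinstance(value, float) else f"{name}: {value}")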
21 changes: 14 additions & 7 deletions nemo_skills/evaluation/metrics/if_metrics.py
@@ -77,16 +77,18 @@ def update(self, predictions):
             predictions (list[dict]): aggregated predictions across all generations.
                 The content of the file is benchmark specific.
         """
-        # this shouldn't do any heavy calculation, but just read the metric from existing json entry
-        # all the heavy lifting should be done in the evaluation script
-        if aggregation_mode == "best":
+        if len(predictions) > 1:
+            # Selecting the best among all predictions
+            self.agg_mode = "best"
+
             self._update_single_stat(self.strict_stats, [pred['strict_eval'] for pred in predictions])
             self._update_single_stat(self.loose_stats, [pred['loose_eval'] for pred in predictions])
-        elif aggregation_mode == "first":
+        else:
+            # Single prediction
+            self.agg_mode = "greedy"
+
             self._update_single_stat(self.strict_stats, [predictions[0]['strict_eval']])
             self._update_single_stat(self.loose_stats, [predictions[0]['loose_eval']])
-        else:
-            raise ValueError(f"Unsupported mode {aggregation_mode}")
 
     def get_metrics(self):
         prompt_total = self.strict_stats['prompt']['total']
@@ -95,7 +97,7 @@ def get_metrics(self):
         inst_strict = self.strict_stats['instruction']['correct'] / inst_total * 100.0
         prompt_loose = self.loose_stats['prompt']['correct'] / prompt_total * 100.0
         inst_loose = self.loose_stats['instruction']['correct'] / inst_total * 100.0
-        return {
+        metrics_dict = {
            "num_prompts": prompt_total,
            "num_instructions": inst_total,
            "average_score": (prompt_strict + inst_strict + prompt_loose + inst_loose) / 4,
@@ -105,6 +107,8 @@ def get_metrics(self):
            "instruction_loose_accuracy": inst_loose,
        }
 
+        return {self.agg_mode: metrics_dict}
+
     def reset(self):
         # the original code also has a deeper breakdown into tier1 scores,
         # but that's probably too much for us to track at this stage
@@ -118,3 +122,6 @@ def reset(self):
            "instruction": {"total": 0, "correct": 0},
            "tier0": {"total": defaultdict(int), "correct": defaultdict(int)},
        }
+
+        # Automatically set
+        self.agg_mode = "greedy"
18 changes: 11 additions & 7 deletions nemo_skills/evaluation/metrics/lean4_metrics.py
@@ -28,33 +28,37 @@ def is_incomplete(self, elem):
         incomplete = 'proof_status' not in elem
         return incomplete
 
-    def update(self, predictions, aggregation_mode):
+    def update(self, predictions):
         """Updating the evaluation results with the current element.
 
         Args:
             predictions (list[dict]): aggregated predictions across all generations.
                 The content of the file is benchmark specific.
-            aggregation_mode (str): "best", "first", etc. Might vary by benchmark.
         """
         self.total += 1
 
-        if aggregation_mode == "best":
+        if len(predictions) > 1:
+            # Multiple predictions, select the pass@k
+            self.agg_mode = f"pass@{len(predictions)}"
+
             self.correct_proof += any([elem['proof_status'] == "completed" for elem in predictions])
             if all([elem['proof_status'] == "timeout" for elem in predictions]):
                 self.timeout_error += 1
-        elif aggregation_mode == "first":
+        else:
+            self.agg_mode = "greedy"
+
             self.correct_proof += predictions[0]['proof_status'] == "completed"
             self.timeout_error += predictions[0]['proof_status'] == "timeout"
-        else:
-            raise ValueError(f"Unsupported mode {aggregation_mode}")
 
     def get_metrics(self):
         metrics = {"num_entries": self.total}
         metrics["lean4_correct"] = self.correct_proof / self.total * 100.0
         metrics["timeout_error"] = self.timeout_error / self.total * 100.0
-        return metrics
+        return {self.agg_mode: metrics}
 
     def reset(self):
         self.correct_proof = 0
         self.timeout_error = 0
         self.total = 0
+        # Aggregation mode is automatically set
+        self.agg_mode = "greedy"
17 changes: 13 additions & 4 deletions nemo_skills/evaluation/metrics/math_metrics.py
@@ -153,7 +153,7 @@ def update(self, predictions):
                 no_answer = True
 
             self.update_comb_metric(
-                self.agg_mode_dict["pass"], current_correct_sympy, current_correct_judge, no_answer
+                self.agg_mode_dict[f"pass@{len(predictions)}"], current_correct_sympy, current_correct_judge, no_answer
             )
 
             # Majority@K
@@ -180,7 +180,10 @@ def get_majority_result(predictions, result_extractor):
             )
 
             self.update_comb_metric(
-                self.agg_mode_dict["majority"], current_correct_sympy, current_correct_judge, no_answer
+                self.agg_mode_dict[f"majority@{len(predictions)}"],
+                current_correct_sympy,
+                current_correct_judge,
+                no_answer,
             )
 
             # Reward Models
@@ -210,7 +213,10 @@ def get_reward_best_result(predictions, result_extractor):
             )
 
             self.update_comb_metric(
-                self.agg_mode_dict["rm_best"], current_correct_sympy, current_correct_judge, no_answer
+                self.agg_mode_dict[f"rm_best@{len(predictions)}"],
+                current_correct_sympy,
+                current_correct_judge,
+                no_answer,
             )
 
             # Reinitialize local vars for tracking prediction correctness
@@ -246,7 +252,10 @@ def get_majority_reward_result(predictions, result_extractor):
             )
 
             self.update_comb_metric(
-                self.agg_mode_dict["rm_majority"], current_correct_sympy, current_correct_judge, no_answer
+                self.agg_mode_dict[f"rm_majority@{len(predictions)}"],
+                current_correct_sympy,
+                current_correct_judge,
+                no_answer,
             )
 
     def get_metrics(self):
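
Unlike the other files, math_metrics.py keeps a dict of aggregation buckets instead of a single self.agg_mode; the hunks above only rename the bucket keys so that they carry the number of generations. A small standalone illustration of the key naming (the surrounding correctness logic of update_comb_metric is not reproduced here, and the sample predictions are hypothetical):

predictions = [{"predicted_answer": "42"}] * 8  # hypothetical batch of 8 generations
k = len(predictions)

# old bucket keys vs. the keys written by this commit
keys_before = ["pass", "majority", "rm_best", "rm_majority"]
keys_after = [f"pass@{k}", f"majority@{k}", f"rm_best@{k}", f"rm_majority@{k}"]
print(keys_after)  # ['pass@8', 'majority@8', 'rm_best@8', 'rm_majority@8']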
19 changes: 12 additions & 7 deletions nemo_skills/evaluation/metrics/mtbench_metrics.py
@@ -62,19 +62,22 @@ def fill_up_missing(self):
     def is_incomplete(self, elem):
         return 'judgement-turn1' not in elem or 'judgement-turn2' not in elem
 
-    def update(self, predictions, aggregation_mode):
+    def update(self, predictions):
         """Updating the evaluation results with the current element.
 
         Args:
             predictions (list[dict]): aggregated predictions across all generations.
                 The content of the file is benchmark specific.
-            aggregation_mode (str): "best", "first", etc. Might vary by benchmark.
         """
         # this shouldn't do any heavy calculation, but just read the metric from existing json entry
         # all the heavy lifting should be done in the evaluation script
         self.total += 1
-        if aggregation_mode == "best":
+
+        if len(predictions) > 1:
             # TODO: might all have missing judgement?
+            # If multiple predictions, set it to "best" aggregation mode
+            self.agg_mode = "best"
+
             rating1 = max(
                 int(re.search(r'Rating: \[\[(\d+)\]\]', elem['judgement-turn1']).group(1))
                 for elem in predictions
@@ -87,15 +90,16 @@ def update(self, predictions, aggregation_mode):
             )
             category = predictions[0]['category']
             self.scores[category].append((rating1, rating2))
-        elif aggregation_mode == "first":
+        else:
+            # If single prediction, set it to greedy aggregation mode
+            self.agg_mode = "greedy"
+
             rating1_match = re.search(r'Rating: \[\[(\d+)\]\]', predictions[0]['judgement-turn1'])
             rating1 = int(rating1_match.group(1)) if rating1_match else None
             rating2_match = re.search(r'Rating: \[\[(\d+)\]\]', predictions[0]['judgement-turn2'])
             rating2 = int(rating2_match.group(1)) if rating2_match else None
             category = predictions[0]['category']
             self.scores[category].append((rating1, rating2))
-        else:
-            raise ValueError(f"Unsupported mode {aggregation_mode}")
 
     def get_metrics(self):
         metrics = {'num_entries': self.total}
@@ -127,11 +131,12 @@ def get_metrics(self):
         metrics["missing_rating_turn1"] = none_count_turn1
         metrics["missing_rating_turn2"] = none_count_turn2
         print("Please see metrics.json for MT-bench per-category breakdown")
-        return metrics
+        return {self.agg_mode: metrics}
 
     def reset(self):
         self.scores = defaultdict(list)
         self.total = 0
+        self.agg_mode = "greedy"
 
     def max_metrics_to_print(self):
         """We are only printing the averages, but all other metrics can still be found in metrics.json"""
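
The greedy branch above guards against a missing judge rating by falling back to None, while the multi-prediction branch still calls .group(1) directly (hence the TODO). A hypothetical helper showing the same guarded extraction in isolation; extract_rating is not part of the commit:

import re


def extract_rating(judgement):
    # MT-bench judgements embed the score as "Rating: [[7]]"; return None if it is absent
    match = re.search(r'Rating: \[\[(\d+)\]\]', judgement)
    return int(match.group(1)) if match else None


print(extract_rating("The answer is detailed and correct. Rating: [[8]]"))  # 8
print(extract_rating("The judge did not produce a rating."))  # None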
