diff --git a/nemo_skills/evaluation/metrics/arena_metrics.py b/nemo_skills/evaluation/metrics/arena_metrics.py
index 214b9e2a4..28acded35 100644
--- a/nemo_skills/evaluation/metrics/arena_metrics.py
+++ b/nemo_skills/evaluation/metrics/arena_metrics.py
@@ -73,19 +73,20 @@ def fill_up_missing(self):
     def is_incomplete(self, elem):
         return 'judgement-gen-base' not in elem or 'judgement-base-gen' not in elem or 'generation' not in elem
 
-    def update(self, predictions, aggregation_mode):
+    def update(self, predictions):
         """Updating the evaluation results with the current element.
 
         Args:
             predictions (list[dict]): aggregated predictions across all generations.
                 The content of the file is benchmark specific.
-            aggregation_mode (str): "best", "first", etc. Might vary by benchmark.
         """
         # this shouldn't do any heavy calculation, but just read the metric from existing json entry
         # all the heavy lifting should be done in the evaluation script
         self.total += 1
         self.scores.append([])
-        if aggregation_mode == "best":
+        if len(predictions) > 1:
+            self.agg_mode = f"pass@{len(predictions)}"
+
             judge_scores = [self._get_judge_score(elem['judgement-gen-base']) for elem in predictions]
             # adding the best score out of all the generations
             possible_scores = ['A>>B', 'A>B', 'A=B', 'B>A', 'B>>A']
@@ -110,14 +111,15 @@ def update(self, predictions, aggregation_mode):
                     break
             else:
                 self.scores[-1].append(None)  # in case judge didn't generate a valid score
-        elif aggregation_mode == "first":
+        else:
+            # Single prediction
+            self.agg_mode = "greedy"
+
             self.lengths += len(predictions[0]['generation'])
             self.scores[-1] = [
                 self._get_judge_score(predictions[0]['judgement-gen-base']),
                 self._get_judge_score(predictions[0]['judgement-base-gen']),
             ]
-        else:
-            raise ValueError(f"Unsupported mode {aggregation_mode}")
 
     def get_metrics(self):
         from nemo_skills.evaluation.arena_utils import get_aggregate_score
@@ -125,9 +127,11 @@ def get_metrics(self):
         metrics = {'num_entries': self.total}
         metrics.update(get_aggregate_score(self.scores))
         metrics['avg_response_length'] = self.lengths / self.total
-        return metrics
+        return {self.agg_mode: metrics}
 
     def reset(self):
         self.scores = []  # list of lists
         self.lengths = 0
         self.total = 0
+        # Set automatically
+        self.agg_mode = "greedy"
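For reference, the pass@k branch above feeds the per-generation judge verdicts into a priority scan over possible_scores. The actual selection loop sits between the two hunks and is not shown here, so the snippet below is only a plausible sketch assembled from the visible pieces (possible_scores, judge_scores, and the for/else fallback to None); it is not code taken from arena_metrics.py.

# Plausible sketch only; the real selection loop is outside the shown hunks.
possible_scores = ['A>>B', 'A>B', 'A=B', 'B>A', 'B>>A']
judge_scores = ['A=B', 'A>B']  # e.g. parsed from each generation's 'judgement-gen-base'

best_score = None
for possible_score in possible_scores:
    if possible_score in judge_scores:
        best_score = possible_score
        break  # first hit in priority order wins
# best_score == 'A>B'; it stays None when no generation got a valid verdict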
diff --git a/nemo_skills/evaluation/metrics/code_metrics.py b/nemo_skills/evaluation/metrics/code_metrics.py
index 6a52183a4..cbd506340 100644
--- a/nemo_skills/evaluation/metrics/code_metrics.py
+++ b/nemo_skills/evaluation/metrics/code_metrics.py
@@ -27,10 +23,6 @@ def fill_up_missing(self):
     def is_incomplete(self, elem):
         return 'is_correct' not in elem or 'is_correct-plus' not in elem
 
-    def _update_perf_dict(self, perf_dict, correct, correct_plus):
-        perf_dict["total_correct"] += correct
-        perf_dict["total_correct_plus"] += correct_plus
-
     def update(self, predictions):
         """Updating the evaluation results with the current element.
 
@@ -41,13 +37,16 @@ def update(self, predictions):
 
         self.total += 1
         if len(predictions) > 1:
-            correct = any([elem['is_correct'] for elem in predictions])
-            correct_plus = any([elem['is_correct-plus'] for elem in predictions])
-            self._update_perf_dict(self.agg_mode_dict["best"], correct, correct_plus)
+            self.agg_mode = f"pass@{len(predictions)}"
+
+            self.total_correct += any([elem['is_correct'] for elem in predictions])
+            self.total_correct_plus += any([elem['is_correct-plus'] for elem in predictions])
         else:
-            correct = predictions[0]['is_correct']
-            correct_plus = predictions[0]['is_correct-plus']
-            self._update_perf_dict(self.agg_mode_dict["greedy"], correct, correct_plus)
+            # If single prediction, set it to greedy aggregation mode
+            self.agg_mode = "greedy"
+
+            self.total_correct += predictions[0]['is_correct']
+            self.total_correct_plus += predictions[0]['is_correct-plus']
 
     def get_metrics(self):
         metrics_dict = {}
@@ -57,8 +56,11 @@
             metrics_dict[agg_mode]["passing_base_tests"] = (agg_metric_dict["total_correct"] / self.total) * 100.0
             metrics_dict[agg_mode]["passing_plus_tests"] = (agg_metric_dict["total_correct_plus"] / self.total) * 100.0
 
-        return metrics_dict
+        return {self.agg_mode: metrics_dict}
 
     def reset(self):
         self.total = 0
-        self.agg_mode_dict = defaultdict(lambda: defaultdict(int))
+        self.total_correct = 0
+        self.total_correct_plus = 0
+        # Aggregation mode is automatically set
+        self.agg_mode = "greedy"
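The rewritten update above makes the pass@k bookkeeping explicit: with several generations per problem, a problem counts as solved if any generation passes, and a single generation is reported under the "greedy" key. A small self-contained illustration of that accumulation (the sample dicts are made up):

# Illustration of the accumulation logic added above; values are made up.
predictions = [
    {'is_correct': False, 'is_correct-plus': False},
    {'is_correct': True, 'is_correct-plus': False},
]

agg_mode = f"pass@{len(predictions)}" if len(predictions) > 1 else "greedy"     # 'pass@2'
total_correct = int(any(elem['is_correct'] for elem in predictions))            # 1
total_correct_plus = int(any(elem['is_correct-plus'] for elem in predictions))  # 0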
""" - # this shouldn't do any heavy calculation, but just read the metric from existing json entry - # all the heavy lifting should be done in the evaluation script - if aggregation_mode == "best": + if len(predictions) > 1: + # Selecting the best among all predictions + self.agg_mode = "best" + self._update_single_stat(self.strict_stats, [pred['strict_eval'] for pred in predictions]) self._update_single_stat(self.loose_stats, [pred['loose_eval'] for pred in predictions]) - elif aggregation_mode == "first": + else: + # Single prediction + self.agg_mode = "greedy" + self._update_single_stat(self.strict_stats, [predictions[0]['strict_eval']]) self._update_single_stat(self.loose_stats, [predictions[0]['loose_eval']]) - else: - raise ValueError(f"Unsupported mode {aggregation_mode}") def get_metrics(self): prompt_total = self.strict_stats['prompt']['total'] @@ -95,7 +97,7 @@ def get_metrics(self): inst_strict = self.strict_stats['instruction']['correct'] / inst_total * 100.0 prompt_loose = self.loose_stats['prompt']['correct'] / prompt_total * 100.0 inst_loose = self.loose_stats['instruction']['correct'] / inst_total * 100.0 - return { + metrics_dict = { "num_prompts": prompt_total, "num_instructions": inst_total, "average_score": (prompt_strict + inst_strict + prompt_loose + inst_loose) / 4, @@ -105,6 +107,8 @@ def get_metrics(self): "instruction_loose_accuracy": inst_loose, } + return {self.agg_mode: metrics_dict} + def reset(self): # the original code also has a deeper breakdown into tier1 scores, # but that's probably too much for us to track at this stage @@ -118,3 +122,6 @@ def reset(self): "instruction": {"total": 0, "correct": 0}, "tier0": {"total": defaultdict(int), "correct": defaultdict(int)}, } + + # Automatically set + self.agg_mode = "greedy" diff --git a/nemo_skills/evaluation/metrics/lean4_metrics.py b/nemo_skills/evaluation/metrics/lean4_metrics.py index 32f4ac33c..c4006f7cb 100644 --- a/nemo_skills/evaluation/metrics/lean4_metrics.py +++ b/nemo_skills/evaluation/metrics/lean4_metrics.py @@ -28,33 +28,37 @@ def is_incomplete(self, elem): incomplete = 'proof_status' not in elem return incomplete - def update(self, predictions, aggregation_mode): + def update(self, predictions): """Updating the evaluation results with the current element. Args: predictions (list[dict]): aggregated predictions across all generations. The content of the file is benchmark specific. - aggregation_mode (str): "best", "first", etc. Might vary by benchmark. 
""" self.total += 1 - if aggregation_mode == "best": + if len(predictions) > 1: + # Multiple predictions, select the pass@k + self.agg_mode = f"pass@{len(predictions)}" + self.correct_proof += any([elem['proof_status'] == "completed" for elem in predictions]) if all([elem['proof_status'] == "timeout" for elem in predictions]): self.timeout_error += 1 - elif aggregation_mode == "first": + else: + self.agg_mode = "greedy" + self.correct_proof += predictions[0]['proof_status'] == "completed" self.timeout_error += predictions[0]['proof_status'] == "timeout" - else: - raise ValueError(f"Unsupported mode {aggregation_mode}") def get_metrics(self): metrics = {"num_entries": self.total} metrics["lean4_correct"] = self.correct_proof / self.total * 100.0 metrics["timeout_error"] = self.timeout_error / self.total * 100.0 - return metrics + return {self.agg_mode: metrics} def reset(self): self.correct_proof = 0 self.timeout_error = 0 self.total = 0 + # Aggregation mode is automatically set + self.agg_mode = "greedy" diff --git a/nemo_skills/evaluation/metrics/math_metrics.py b/nemo_skills/evaluation/metrics/math_metrics.py index 3cf84edad..744b958b3 100644 --- a/nemo_skills/evaluation/metrics/math_metrics.py +++ b/nemo_skills/evaluation/metrics/math_metrics.py @@ -153,7 +153,7 @@ def update(self, predictions): no_answer = True self.update_comb_metric( - self.agg_mode_dict["pass"], current_correct_sympy, current_correct_judge, no_answer + self.agg_mode_dict[f"pass@{len(predictions)}"], current_correct_sympy, current_correct_judge, no_answer ) # Majority@K @@ -180,7 +180,10 @@ def get_majority_result(predictions, result_extractor): ) self.update_comb_metric( - self.agg_mode_dict["majority"], current_correct_sympy, current_correct_judge, no_answer + self.agg_mode_dict[f"majority@{len(predictions)}"], + current_correct_sympy, + current_correct_judge, + no_answer, ) # Reward Models @@ -210,7 +213,10 @@ def get_reward_best_result(predictions, result_extractor): ) self.update_comb_metric( - self.agg_mode_dict["rm_best"], current_correct_sympy, current_correct_judge, no_answer + self.agg_mode_dict[f"rm_best@{len(predictions)}"], + current_correct_sympy, + current_correct_judge, + no_answer, ) # Reinitialize local vars for tracking prediction correctness @@ -246,7 +252,10 @@ def get_majority_reward_result(predictions, result_extractor): ) self.update_comb_metric( - self.agg_mode_dict["rm_majority"], current_correct_sympy, current_correct_judge, no_answer + self.agg_mode_dict[f"rm_majority@{len(predictions)}"], + current_correct_sympy, + current_correct_judge, + no_answer, ) def get_metrics(self): diff --git a/nemo_skills/evaluation/metrics/mtbench_metrics.py b/nemo_skills/evaluation/metrics/mtbench_metrics.py index 162d286f8..122efdced 100644 --- a/nemo_skills/evaluation/metrics/mtbench_metrics.py +++ b/nemo_skills/evaluation/metrics/mtbench_metrics.py @@ -62,19 +62,22 @@ def fill_up_missing(self): def is_incomplete(self, elem): return 'judgement-turn1' not in elem or 'judgement-turn2' not in elem - def update(self, predictions, aggregation_mode): + def update(self, predictions): """Updating the evaluation results with the current element. Args: predictions (list[dict]): aggregated predictions across all generations. The content of the file is benchmark specific. - aggregation_mode (str): "best", "first", etc. Might vary by benchmark. 
""" # this shouldn't do any heavy calculation, but just read the metric from existing json entry # all the heavy lifting should be done in the evaluation script self.total += 1 - if aggregation_mode == "best": + + if len(predictions) > 1: # TODO: might all have missing judgement? + # If multiple predictions, set it to "best" aggregation mode + self.agg_mode = "best" + rating1 = max( int(re.search(r'Rating: \[\[(\d+)\]\]', elem['judgement-turn1']).group(1)) for elem in predictions @@ -87,15 +90,16 @@ def update(self, predictions, aggregation_mode): ) category = predictions[0]['category'] self.scores[category].append((rating1, rating2)) - elif aggregation_mode == "first": + else: + # If single prediction, set it to greedy aggregation mode + self.agg_mode = "greedy" + rating1_match = re.search(r'Rating: \[\[(\d+)\]\]', predictions[0]['judgement-turn1']) rating1 = int(rating1_match.group(1)) if rating1_match else None rating2_match = re.search(r'Rating: \[\[(\d+)\]\]', predictions[0]['judgement-turn2']) rating2 = int(rating2_match.group(1)) if rating2_match else None category = predictions[0]['category'] self.scores[category].append((rating1, rating2)) - else: - raise ValueError(f"Unsupported mode {aggregation_mode}") def get_metrics(self): metrics = {'num_entries': self.total} @@ -127,11 +131,12 @@ def get_metrics(self): metrics["missing_rating_turn1"] = none_count_turn1 metrics["missing_rating_turn2"] = none_count_turn2 print("Please see metrics.json for MT-bench per-category breakdown") - return metrics + return {self.agg_mode: metrics} def reset(self): self.scores = defaultdict(list) self.total = 0 + self.agg_mode = "greedy" def max_metrics_to_print(self): """We are only printing the averages, but all other metrics can still be found in metrics.json"""