Changing the return format from metrics computation
shtoshni committed Nov 27, 2024
1 parent 261331d commit 4e8ddb3
Showing 6 changed files with 75 additions and 44 deletions.
18 changes: 11 additions & 7 deletions nemo_skills/evaluation/metrics/arena_metrics.py
@@ -73,19 +73,20 @@ def fill_up_missing(self):
     def is_incomplete(self, elem):
         return 'judgement-gen-base' not in elem or 'judgement-base-gen' not in elem or 'generation' not in elem
 
-    def update(self, predictions, aggregation_mode):
+    def update(self, predictions):
         """Updating the evaluation results with the current element.
 
         Args:
             predictions (list[dict]): aggregated predictions across all generations.
                 The content of the file is benchmark specific.
-            aggregation_mode (str): "best", "first", etc. Might vary by benchmark.
         """
         # this shouldn't do any heavy calculation, but just read the metric from existing json entry
         # all the heavy lifting should be done in the evaluation script
         self.total += 1
         self.scores.append([])
-        if aggregation_mode == "best":
+        if len(predictions) > 1:
+            self.agg_mode = f"pass@{len(predictions)}"
+
             judge_scores = [self._get_judge_score(elem['judgement-gen-base']) for elem in predictions]
             # adding the best score out of all the generations
             possible_scores = ['A>>B', 'A>B', 'A=B', 'B>A', 'B>>A']
@@ -110,24 +111,27 @@ def update(self, predictions, aggregation_mode):
                     break
             else:
                 self.scores[-1].append(None)  # in case judge didn't generate a valid score
-        elif aggregation_mode == "first":
+        else:
+            # Single prediction
+            self.agg_mode = "greedy"
+
             self.lengths += len(predictions[0]['generation'])
             self.scores[-1] = [
                 self._get_judge_score(predictions[0]['judgement-gen-base']),
                 self._get_judge_score(predictions[0]['judgement-base-gen']),
             ]
-        else:
-            raise ValueError(f"Unsupported mode {aggregation_mode}")
 
     def get_metrics(self):
         from nemo_skills.evaluation.arena_utils import get_aggregate_score
 
         metrics = {'num_entries': self.total}
         metrics.update(get_aggregate_score(self.scores))
         metrics['avg_response_length'] = self.lengths / self.total
-        return metrics
+        return {self.agg_mode: metrics}
 
     def reset(self):
         self.scores = []  # list of lists
         self.lengths = 0
         self.total = 0
+        # Set automatically
+        self.agg_mode = "greedy"
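
Taken together, the hunks above switch ArenaMetrics to the convention this commit introduces across the metrics classes: update() infers the aggregation mode from the number of predictions instead of receiving an aggregation_mode argument, and get_metrics() nests its results under that mode. A minimal standalone sketch of that convention follows; the ExampleMetrics class is illustrative only and not code from this repository.

class ExampleMetrics:
    def __init__(self):
        self.reset()

    def update(self, predictions):
        self.total += 1
        if len(predictions) > 1:
            # multiple generations per sample -> pass@k style aggregation
            self.agg_mode = f"pass@{len(predictions)}"
            self.correct += any(elem.get('is_correct', False) for elem in predictions)
        else:
            # single greedy generation
            self.agg_mode = "greedy"
            self.correct += predictions[0].get('is_correct', False)

    def get_metrics(self):
        metrics = {"num_entries": self.total, "accuracy": 100.0 * self.correct / self.total}
        # new return format: metrics keyed by the detected aggregation mode
        return {self.agg_mode: metrics}

    def reset(self):
        self.total = 0
        self.correct = 0
        # set automatically inside update()
        self.agg_mode = "greedy"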
26 changes: 14 additions & 12 deletions nemo_skills/evaluation/metrics/code_metrics.py
@@ -27,10 +27,6 @@ def fill_up_missing(self):
     def is_incomplete(self, elem):
         return 'is_correct' not in elem or 'is_correct-plus' not in elem
 
-    def _update_perf_dict(self, perf_dict, correct, correct_plus):
-        perf_dict["total_correct"] += correct
-        perf_dict["total_correct_plus"] += correct_plus
-
     def update(self, predictions):
         """Updating the evaluation results with the current element.
 
@@ -41,13 +37,16 @@ def update(self, predictions):
         self.total += 1
 
         if len(predictions) > 1:
-            correct = any([elem['is_correct'] for elem in predictions])
-            correct_plus = any([elem['is_correct-plus'] for elem in predictions])
-            self._update_perf_dict(self.agg_mode_dict["best"], correct, correct_plus)
+            self.agg_mode = f"pass@{len(predictions)}"
+
+            self.total_correct += any([elem['is_correct'] for elem in predictions])
+            self.total_correct_plus += any([elem['is_correct-plus'] for elem in predictions])
         else:
-            correct = predictions[0]['is_correct']
-            correct_plus = predictions[0]['is_correct-plus']
-            self._update_perf_dict(self.agg_mode_dict["greedy"], correct, correct_plus)
+            # If single prediction, set it to greedy aggregation mode
+            self.agg_mode = "greedy"
+
+            self.total_correct += predictions[0]['is_correct']
+            self.total_correct_plus += predictions[0]['is_correct-plus']
 
     def get_metrics(self):
         metrics_dict = {}
@@ -57,8 +56,11 @@ def get_metrics(self):
             metrics_dict[agg_mode]["passing_base_tests"] = (agg_metric_dict["total_correct"] / self.total) * 100.0
             metrics_dict[agg_mode]["passing_plus_tests"] = (agg_metric_dict["total_correct_plus"] / self.total) * 100.0
 
-        return metrics_dict
+        return {self.agg_mode: metrics_dict}
 
     def reset(self):
         self.total = 0
-        self.agg_mode_dict = defaultdict(lambda: defaultdict(int))
+        self.total_correct = 0
+        self.total_correct_plus = 0
+        # Aggregation mode is automatically set
+        self.agg_mode = "greedy"
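
One practical consequence for callers: after this commit every get_metrics() returns a mapping from the aggregation mode to the metric values, so consumers iterate one level deeper. The print_metrics helper below is a hypothetical sketch of such a consumer and is not a function from nemo_skills.

def print_metrics(metrics_by_mode):
    # metrics_by_mode has the new shape, e.g. {"pass@4": {"num_entries": ..., "passing_base_tests": ...}}
    for agg_mode, metrics in metrics_by_mode.items():
        print(f"----- {agg_mode} -----")
        for name, value in metrics.items():
            print(f"{name}: {value:.2f}" if isinstance(value, float) else f"{name}: {value}")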
21 changes: 14 additions & 7 deletions nemo_skills/evaluation/metrics/if_metrics.py
@@ -77,16 +77,18 @@ def update(self, predictions):
             predictions (list[dict]): aggregated predictions across all generations.
                 The content of the file is benchmark specific.
         """
-        # this shouldn't do any heavy calculation, but just read the metric from existing json entry
-        # all the heavy lifting should be done in the evaluation script
-        if aggregation_mode == "best":
+        if len(predictions) > 1:
+            # Selecting the best among all predictions
+            self.agg_mode = "best"
+
             self._update_single_stat(self.strict_stats, [pred['strict_eval'] for pred in predictions])
             self._update_single_stat(self.loose_stats, [pred['loose_eval'] for pred in predictions])
-        elif aggregation_mode == "first":
+        else:
+            # Single prediction
+            self.agg_mode = "greedy"
+
             self._update_single_stat(self.strict_stats, [predictions[0]['strict_eval']])
             self._update_single_stat(self.loose_stats, [predictions[0]['loose_eval']])
-        else:
-            raise ValueError(f"Unsupported mode {aggregation_mode}")
 
     def get_metrics(self):
         prompt_total = self.strict_stats['prompt']['total']
@@ -95,7 +97,7 @@ def get_metrics(self):
         inst_strict = self.strict_stats['instruction']['correct'] / inst_total * 100.0
         prompt_loose = self.loose_stats['prompt']['correct'] / prompt_total * 100.0
         inst_loose = self.loose_stats['instruction']['correct'] / inst_total * 100.0
-        return {
+        metrics_dict = {
            "num_prompts": prompt_total,
            "num_instructions": inst_total,
            "average_score": (prompt_strict + inst_strict + prompt_loose + inst_loose) / 4,
@@ -105,6 +107,8 @@ def get_metrics(self):
            "instruction_loose_accuracy": inst_loose,
        }
 
+        return {self.agg_mode: metrics_dict}
+
     def reset(self):
         # the original code also has a deeper breakdown into tier1 scores,
         # but that's probably too much for us to track at this stage
@@ -118,3 +122,6 @@ def reset(self):
            "instruction": {"total": 0, "correct": 0},
            "tier0": {"total": defaultdict(int), "correct": defaultdict(int)},
        }
+
+        # Automatically set
+        self.agg_mode = "greedy"
18 changes: 11 additions & 7 deletions nemo_skills/evaluation/metrics/lean4_metrics.py
@@ -28,33 +28,37 @@ def is_incomplete(self, elem):
         incomplete = 'proof_status' not in elem
         return incomplete
 
-    def update(self, predictions, aggregation_mode):
+    def update(self, predictions):
         """Updating the evaluation results with the current element.
 
         Args:
             predictions (list[dict]): aggregated predictions across all generations.
                 The content of the file is benchmark specific.
-            aggregation_mode (str): "best", "first", etc. Might vary by benchmark.
         """
         self.total += 1
 
-        if aggregation_mode == "best":
+        if len(predictions) > 1:
+            # Multiple predictions, select the pass@k
+            self.agg_mode = f"pass@{len(predictions)}"
+
             self.correct_proof += any([elem['proof_status'] == "completed" for elem in predictions])
             if all([elem['proof_status'] == "timeout" for elem in predictions]):
                 self.timeout_error += 1
-        elif aggregation_mode == "first":
+        else:
+            self.agg_mode = "greedy"
+
             self.correct_proof += predictions[0]['proof_status'] == "completed"
             self.timeout_error += predictions[0]['proof_status'] == "timeout"
-        else:
-            raise ValueError(f"Unsupported mode {aggregation_mode}")
 
     def get_metrics(self):
         metrics = {"num_entries": self.total}
         metrics["lean4_correct"] = self.correct_proof / self.total * 100.0
         metrics["timeout_error"] = self.timeout_error / self.total * 100.0
-        return metrics
+        return {self.agg_mode: metrics}
 
     def reset(self):
         self.correct_proof = 0
         self.timeout_error = 0
         self.total = 0
+        # Aggregation mode is automatically set
+        self.agg_mode = "greedy"
17 changes: 13 additions & 4 deletions nemo_skills/evaluation/metrics/math_metrics.py
@@ -153,7 +153,7 @@ def update(self, predictions):
                 no_answer = True
 
             self.update_comb_metric(
-                self.agg_mode_dict["pass"], current_correct_sympy, current_correct_judge, no_answer
+                self.agg_mode_dict[f"pass@{len(predictions)}"], current_correct_sympy, current_correct_judge, no_answer
             )
 
             # Majority@K
@@ -180,7 +180,10 @@ def get_majority_result(predictions, result_extractor):
             )
 
             self.update_comb_metric(
-                self.agg_mode_dict["majority"], current_correct_sympy, current_correct_judge, no_answer
+                self.agg_mode_dict[f"majority@{len(predictions)}"],
+                current_correct_sympy,
+                current_correct_judge,
+                no_answer,
             )
 
             # Reward Models
@@ -210,7 +213,10 @@ def get_reward_best_result(predictions, result_extractor):
             )
 
             self.update_comb_metric(
-                self.agg_mode_dict["rm_best"], current_correct_sympy, current_correct_judge, no_answer
+                self.agg_mode_dict[f"rm_best@{len(predictions)}"],
+                current_correct_sympy,
+                current_correct_judge,
+                no_answer,
             )
 
             # Reinitialize local vars for tracking prediction correctness
@@ -246,7 +252,10 @@ def get_majority_reward_result(predictions, result_extractor):
             )
 
             self.update_comb_metric(
-                self.agg_mode_dict["rm_majority"], current_correct_sympy, current_correct_judge, no_answer
+                self.agg_mode_dict[f"rm_majority@{len(predictions)}"],
+                current_correct_sympy,
+                current_correct_judge,
+                no_answer,
             )
 
     def get_metrics(self):
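
Unlike the other files, math_metrics.py keeps a dict of aggregation buckets instead of a single self.agg_mode; the hunks above only rename the bucket keys so that they carry the number of generations. A small standalone illustration of the key naming (the surrounding correctness logic of update_comb_metric is not reproduced here, and the sample predictions are hypothetical):

predictions = [{"predicted_answer": "42"}] * 8  # hypothetical batch of 8 generations
k = len(predictions)

# old bucket keys vs. the keys written by this commit
keys_before = ["pass", "majority", "rm_best", "rm_majority"]
keys_after = [f"pass@{k}", f"majority@{k}", f"rm_best@{k}", f"rm_majority@{k}"]
print(keys_after)  # ['pass@8', 'majority@8', 'rm_best@8', 'rm_majority@8']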
19 changes: 12 additions & 7 deletions nemo_skills/evaluation/metrics/mtbench_metrics.py
@@ -62,19 +62,22 @@ def fill_up_missing(self):
     def is_incomplete(self, elem):
         return 'judgement-turn1' not in elem or 'judgement-turn2' not in elem
 
-    def update(self, predictions, aggregation_mode):
+    def update(self, predictions):
         """Updating the evaluation results with the current element.
 
         Args:
             predictions (list[dict]): aggregated predictions across all generations.
                 The content of the file is benchmark specific.
-            aggregation_mode (str): "best", "first", etc. Might vary by benchmark.
         """
         # this shouldn't do any heavy calculation, but just read the metric from existing json entry
         # all the heavy lifting should be done in the evaluation script
         self.total += 1
-        if aggregation_mode == "best":
+
+        if len(predictions) > 1:
             # TODO: might all have missing judgement?
+            # If multiple predictions, set it to "best" aggregation mode
+            self.agg_mode = "best"
+
             rating1 = max(
                 int(re.search(r'Rating: \[\[(\d+)\]\]', elem['judgement-turn1']).group(1))
                 for elem in predictions
@@ -87,15 +90,16 @@ def update(self, predictions, aggregation_mode):
             )
             category = predictions[0]['category']
             self.scores[category].append((rating1, rating2))
-        elif aggregation_mode == "first":
+        else:
+            # If single prediction, set it to greedy aggregation mode
+            self.agg_mode = "greedy"
+
             rating1_match = re.search(r'Rating: \[\[(\d+)\]\]', predictions[0]['judgement-turn1'])
             rating1 = int(rating1_match.group(1)) if rating1_match else None
             rating2_match = re.search(r'Rating: \[\[(\d+)\]\]', predictions[0]['judgement-turn2'])
             rating2 = int(rating2_match.group(1)) if rating2_match else None
             category = predictions[0]['category']
             self.scores[category].append((rating1, rating2))
-        else:
-            raise ValueError(f"Unsupported mode {aggregation_mode}")
 
     def get_metrics(self):
         metrics = {'num_entries': self.total}
@@ -127,11 +131,12 @@ def get_metrics(self):
         metrics["missing_rating_turn1"] = none_count_turn1
         metrics["missing_rating_turn2"] = none_count_turn2
         print("Please see metrics.json for MT-bench per-category breakdown")
-        return metrics
+        return {self.agg_mode: metrics}
 
     def reset(self):
         self.scores = defaultdict(list)
         self.total = 0
+        self.agg_mode = "greedy"
 
     def max_metrics_to_print(self):
         """We are only printing the averages, but all other metrics can still be found in metrics.json"""
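
The greedy branch above guards against a missing judge rating by falling back to None, while the multi-prediction branch still calls .group(1) directly (hence the TODO). A hypothetical helper showing the same guarded extraction in isolation; extract_rating is not part of the commit:

import re


def extract_rating(judgement):
    # MT-bench judgements embed the score as "Rating: [[7]]"; return None if it is absent
    match = re.search(r'Rating: \[\[(\d+)\]\]', judgement)
    return int(match.group(1)) if match else None


print(extract_rating("The answer is detailed and correct. Rating: [[8]]"))  # 8
print(extract_rating("The judge did not produce a rating."))  # None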
