From 9b7f9d5782d2620a91bdbc4754c2466a8ebc9390 Mon Sep 17 00:00:00 2001 From: Igor Gitman Date: Wed, 17 Jul 2024 13:21:39 -0700 Subject: [PATCH] Refactoring + fill majority answer script (#66) Signed-off-by: Igor Gitman --- docs/reproducing-results.md | 2 + nemo_skills/evaluation/evaluate_results.py | 106 +------ .../evaluation/fill_majority_answer.py | 139 +++++++++ nemo_skills/evaluation/graders.py | 109 +++++++ nemo_skills/evaluation/metrics.py | 221 ++++++++++++++ nemo_skills/evaluation/settings.py | 75 +++++ .../prepare_sft_data.yaml | 2 + .../data_preparation_utils/preprocessing.py | 14 +- nemo_skills/finetuning/sft_config.yaml | 41 +-- .../finetuning/sft_config_codegen.yaml | 203 +++++++++++++ pipeline/compute_metrics.py | 275 +----------------- pipeline/run_eval.py | 15 +- pipeline/run_sft.py | 4 +- pipeline/summarize_results.py | 13 +- tests/check_help.py | 1 + tests/gpu-tests/test_finetuning.py | 5 +- tests/gpu-tests/test_generation.py | 4 +- 17 files changed, 797 insertions(+), 432 deletions(-) create mode 100644 nemo_skills/evaluation/fill_majority_answer.py create mode 100644 nemo_skills/evaluation/graders.py create mode 100644 nemo_skills/evaluation/metrics.py create mode 100644 nemo_skills/evaluation/settings.py create mode 100644 nemo_skills/finetuning/sft_config_codegen.yaml diff --git a/docs/reproducing-results.md b/docs/reproducing-results.md index 3e108a71e..3eaa925eb 100644 --- a/docs/reproducing-results.md +++ b/docs/reproducing-results.md @@ -200,6 +200,8 @@ you can run the following: --stages sft prepare_eval \ --num_nodes 8 \ --num_gpus 8 \ + --config-file sft_config_codegen \ + --with_sandbox \ ++model.data.train_ds.file_path=/data/sft-data.jsonl \ ++trainer.sft.max_epochs=4 \ ++trainer.sft.val_check_interval=4000 \ diff --git a/nemo_skills/evaluation/evaluate_results.py b/nemo_skills/evaluation/evaluate_results.py index cadae9d44..8bfb2f312 100644 --- a/nemo_skills/evaluation/evaluate_results.py +++ b/nemo_skills/evaluation/evaluate_results.py @@ -12,21 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json import logging -import shutil -import subprocess import sys -from argparse import Namespace from dataclasses import field -from pathlib import Path from typing import Any import hydra -from omegaconf import MISSING, OmegaConf +from omegaconf import MISSING -from nemo_skills.code_execution.sandbox import get_sandbox, sandbox_params -from nemo_skills.evaluation.code_utils import preprocess_code +from nemo_skills.code_execution.sandbox import sandbox_params +from nemo_skills.evaluation.settings import GRADING_MAP from nemo_skills.utils import get_help_message, nested_dataclass, setup_logging LOG = logging.getLogger(__file__) @@ -43,7 +38,7 @@ class EvaluateResultsConfig: # Sandbox configuration {sandbox_params} sandbox: dict = field(default_factory=lambda: {'sandbox_type': 'local'}) - eval_type: str = "math" # math or code + eval_type: str = "math" # math or code TODO: benchmark? 
eval_config: dict = field(default_factory=dict) def __post_init__(self): @@ -56,104 +51,15 @@ def __post_init__(self): cs.store(name="base_evaluate_results_config", node=EvaluateResultsConfig) -def math_eval(cfg): - sandbox = get_sandbox(**cfg.sandbox) - sandbox.batch_evaluate_results( - prediction_jsonl_files=cfg.prediction_jsonl_files, - **cfg.eval_config, - ) - - -def code_eval(cfg): - # TODO: need to move it to a separate docker (either our sandbox or separate srun) - from evalplus.evaluate import evaluate - - # processing each generation separately (TODO: evalplus can do it together, but need to figure out the format) - for jsonl_file in cfg.prediction_jsonl_files: - with open(jsonl_file) as f: - samples = [preprocess_code(json.loads(line)) for line in f] - # all changes will be done with a new key "completion", so it's ok to write to the same file - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - f.write(json.dumps(sample) + "\n") - eval_config = { - "samples": jsonl_file, - "base_only": False, - "parallel": None, - "i_just_wanna_run": False, - "test_details": False, - "min_time_limit": 1, - "gt_time_limit_factor": 4.0, - "mini": False, - "noextreme": False, - "version": "default", - } - eval_config.update(OmegaConf.to_container(cfg.eval_config)) - evaluate(Namespace(**eval_config)) - with open(jsonl_file[:-6] + '_eval_results.json', 'rt', encoding="utf-8") as fin: - evalplus_grades = json.load(fin) - # adding is_correct key to allow compute_metrics to work - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - sample['is_correct'] = evalplus_grades['eval'][sample['task_id']][0]['base_status'] == "pass" - sample['is_correct-plus'] = ( - sample['is_correct'] and evalplus_grades['eval'][sample['task_id']][0]['plus_status'] == "pass" - ) - f.write(json.dumps(sample) + "\n") - - # moving eval file as otherwise evalplus does not want to recompute metrics if it's present.. 
- shutil.move(jsonl_file[:-6] + '_eval_results.json', jsonl_file[:-6] + '_eval_results-saved.json') - - -def ifeval(cfg): - for jsonl_file in cfg.prediction_jsonl_files: - parent_dir = Path(jsonl_file).absolute().parent - cmd = ( - 'cd /opt/benchmarks/google-research && python -m instruction_following_eval.evaluation_main ' - f'--input_data={jsonl_file} ' - f'--input_response_data={jsonl_file} ' - f'--output_dir={parent_dir} ' - ) - subprocess.run(cmd, shell=True, check=True) - # fusing eval metrics back into the generation file - with open(jsonl_file, "rt", encoding="utf-8") as f: - samples = [json.loads(line) for line in f] - - with open(parent_dir / 'eval_results_loose.jsonl', 'rt', encoding="utf-8") as f: - eval_results = [json.loads(line) for line in f] - for sample, eval_result in zip(samples, eval_results): - sample['loose_eval'] = eval_result - - with open(parent_dir / 'eval_results_strict.jsonl', 'rt', encoding="utf-8") as f: - eval_results = [json.loads(line) for line in f] - for sample, eval_result in zip(samples, eval_results): - sample['strict_eval'] = eval_result - - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - f.write(json.dumps(sample) + "\n") - - # removing metric files to avoid reusing them - (parent_dir / 'eval_results_loose.jsonl').unlink() - (parent_dir / 'eval_results_strict.jsonl').unlink() - - -eval_map = { - "math": math_eval, - "code": code_eval, - "ifeval": ifeval, -} - - @hydra.main(version_base=None, config_name="base_evaluate_results_config") def evaluate_results(cfg: EvaluateResultsConfig): cfg = EvaluateResultsConfig(_init_nested=True, **cfg) LOG.info("Config used: %s", cfg) - if cfg.eval_type not in eval_map: + if cfg.eval_type not in GRADING_MAP: raise ValueError(f"Unknown eval_type: {cfg.eval_type}") - eval_map[cfg.eval_type](cfg) + GRADING_MAP[cfg.eval_type](cfg) HELP_MESSAGE = get_help_message( diff --git a/nemo_skills/evaluation/fill_majority_answer.py b/nemo_skills/evaluation/fill_majority_answer.py new file mode 100644 index 000000000..2a2ca078b --- /dev/null +++ b/nemo_skills/evaluation/fill_majority_answer.py @@ -0,0 +1,139 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import logging +import sys +from collections import Counter +from itertools import zip_longest +from typing import Any + +import hydra +from omegaconf import MISSING +from tqdm import tqdm + +from nemo_skills.evaluation.metrics import MathEval, read_predictions +from nemo_skills.utils import get_help_message, nested_dataclass, setup_logging, unroll_files + +LOG = logging.getLogger(__file__) + + +@nested_dataclass +class FillMajorityAnswerConfig: + """Top-level parameters for the script""" + + # list of files to use for majority voting. + # Can specify multiple patterns separated by space + # e.g. 
"path/to/file1.jsonl path/to/file2.jsonl" or with regex + # "test_folder/output-rs*.jsonl" + prediction_jsonl_files: Any = MISSING + + # if set to True will error if any responses/data is missing + allow_incomplete: bool = False + + # minimum number of majority votes to use the answer. + # -1 means use half of the votes, which is a good default value + min_votes: int = -1 + + # will be used to fill up when not enough votes are available for the majority + default_answer: str = "no_answer" + + # will not use any negative answers as this likely indicates bad problems + # (at least for GSM8K domain). If running with other data, where negative answers + # are common, should be set to False + drop_negative_answers: bool = False + + # will not use any non-integer answers as this might indicates bad problems + drop_noninteger_answers: bool = False + + def __post_init__(self): + """Building data_file from dataset/split_name if not provided directly.""" + if isinstance(self.prediction_jsonl_files, str): + self.prediction_jsonl_files = self.prediction_jsonl_files.split(" ") + + +cs = hydra.core.config_store.ConfigStore.instance() +cs.store(name="base_fill_majority_answer_conifg", node=FillMajorityAnswerConfig) + + +@hydra.main(version_base=None, config_name="base_fill_majority_answer_conifg") +def fill_majority_answer(cfg: FillMajorityAnswerConfig): + cfg = FillMajorityAnswerConfig(_init_nested=True, **cfg) + LOG.info("Config used: %s", cfg) + + file_handles = [open(file, "rt", encoding="utf-8") for file in unroll_files(cfg.prediction_jsonl_files)] + if cfg.min_votes < 0: + cfg.min_votes = len(file_handles) // 2 + + # currently majority is only defined for math evals + evaluator = MathEval() + + majority_answers = [] + all_predictions = [] + retained_questions = 0 + for idx, predictions in enumerate(tqdm(zip_longest(*file_handles))): + data = read_predictions(predictions, evaluator, cfg.allow_incomplete) + all_predictions.append(data) + # TODO: currently majority does not take into account equivalent answers written in a different way + valid_answers_and_results = [ + (elem['predicted_answer'], elem['is_correct']) for elem in data if elem['predicted_answer'] is not None + ] + majority_answers.append(cfg.default_answer) + if len(valid_answers_and_results) == 0: + continue + (majority_answer, _), num_votes = Counter(valid_answers_and_results).most_common(1)[0] + + if num_votes <= cfg.min_votes: + continue + + if cfg.drop_negative_answers or cfg.drop_noninteger_answers: + try: + majority_answer = float(majority_answer) + except ValueError: + continue + + if cfg.drop_negative_answers and majority_answer < 0: + continue + + if cfg.drop_noninteger_answers and not majority_answer.is_integer(): + continue + + majority_answers[-1] = majority_answer + retained_questions += 1 + + LOG.info("Total questions: %d, retained questions: %d", len(all_predictions), retained_questions) + + for file_handle in file_handles: + file_handle.close() + + # writing the majority answers back to the files + file_handles = [open(file, "wt", encoding="utf-8") for file in unroll_files(cfg.prediction_jsonl_files)] + for idx, predictions in enumerate(all_predictions): + for lidx, handle in enumerate(file_handles): + predictions[lidx]["expected_answer"] = majority_answers[idx] + handle.write(json.dumps(predictions[lidx]) + "\n") + + for file_handle in file_handles: + file_handle.close() + + +HELP_MESSAGE = get_help_message(FillMajorityAnswerConfig) + + +if __name__ == "__main__": + if '--help' in sys.argv or '-h' in sys.argv: + 
print(HELP_MESSAGE) + else: + setup_logging() + fill_majority_answer() diff --git a/nemo_skills/evaluation/graders.py b/nemo_skills/evaluation/graders.py new file mode 100644 index 000000000..b166c656a --- /dev/null +++ b/nemo_skills/evaluation/graders.py @@ -0,0 +1,109 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import logging +import shutil +import subprocess +from argparse import Namespace +from pathlib import Path + +LOG = logging.getLogger(__file__) + + +def math_eval(cfg): + from nemo_skills.code_execution.sandbox import get_sandbox + + sandbox = get_sandbox(**cfg.sandbox) + sandbox.batch_evaluate_results( + prediction_jsonl_files=cfg.prediction_jsonl_files, + **cfg.eval_config, + ) + + +def code_eval(cfg): + # TODO: need to move it to a separate docker (either our sandbox or separate srun) + from evalplus.evaluate import evaluate + from omegaconf import OmegaConf + + from nemo_skills.evaluation.code_utils import preprocess_code + + # processing each generation separately (TODO: evalplus can do it together, but need to figure out the format) + for jsonl_file in cfg.prediction_jsonl_files: + with open(jsonl_file) as f: + samples = [preprocess_code(json.loads(line)) for line in f] + # all changes will be done with a new key "completion", so it's ok to write to the same file + with open(jsonl_file, "wt", encoding="utf-8") as f: + for sample in samples: + f.write(json.dumps(sample) + "\n") + eval_config = { + "samples": jsonl_file, + "base_only": False, + "parallel": None, + "i_just_wanna_run": False, + "test_details": False, + "min_time_limit": 1, + "gt_time_limit_factor": 4.0, + "mini": False, + "noextreme": False, + "version": "default", + } + eval_config.update(OmegaConf.to_container(cfg.eval_config)) + evaluate(Namespace(**eval_config)) + with open(jsonl_file[:-6] + '_eval_results.json', 'rt', encoding="utf-8") as fin: + evalplus_grades = json.load(fin) + # adding is_correct key to allow compute_metrics to work + with open(jsonl_file, "wt", encoding="utf-8") as f: + for sample in samples: + sample['is_correct'] = evalplus_grades['eval'][sample['task_id']][0]['base_status'] == "pass" + sample['is_correct-plus'] = ( + sample['is_correct'] and evalplus_grades['eval'][sample['task_id']][0]['plus_status'] == "pass" + ) + f.write(json.dumps(sample) + "\n") + + # moving eval file as otherwise evalplus does not want to recompute metrics if it's present.. 
+ shutil.move(jsonl_file[:-6] + '_eval_results.json', jsonl_file[:-6] + '_eval_results-saved.json') + + +def ifeval(cfg): + for jsonl_file in cfg.prediction_jsonl_files: + parent_dir = Path(jsonl_file).absolute().parent + cmd = ( + 'cd /opt/benchmarks/google-research && python -m instruction_following_eval.evaluation_main ' + f'--input_data={jsonl_file} ' + f'--input_response_data={jsonl_file} ' + f'--output_dir={parent_dir} ' + ) + subprocess.run(cmd, shell=True, check=True) + # fusing eval metrics back into the generation file + with open(jsonl_file, "rt", encoding="utf-8") as f: + samples = [json.loads(line) for line in f] + + with open(parent_dir / 'eval_results_loose.jsonl', 'rt', encoding="utf-8") as f: + eval_results = [json.loads(line) for line in f] + for sample, eval_result in zip(samples, eval_results): + sample['loose_eval'] = eval_result + + with open(parent_dir / 'eval_results_strict.jsonl', 'rt', encoding="utf-8") as f: + eval_results = [json.loads(line) for line in f] + for sample, eval_result in zip(samples, eval_results): + sample['strict_eval'] = eval_result + + with open(jsonl_file, "wt", encoding="utf-8") as f: + for sample in samples: + f.write(json.dumps(sample) + "\n") + + # removing metric files to avoid reusing them + (parent_dir / 'eval_results_loose.jsonl').unlink() + (parent_dir / 'eval_results_strict.jsonl').unlink() diff --git a/nemo_skills/evaluation/metrics.py b/nemo_skills/evaluation/metrics.py new file mode 100644 index 000000000..8430af240 --- /dev/null +++ b/nemo_skills/evaluation/metrics.py @@ -0,0 +1,221 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import logging +import sys +from collections import Counter +from itertools import zip_longest +from pathlib import Path + +# adding nemo_skills to python path to avoid requiring installation +sys.path.append(str(Path(__file__).absolute().parents[1])) + +from nemo_skills.utils import unroll_files + +LOG = logging.getLogger(__file__) + + +class MathEval: + def __init__(self): + self.reset() + + def fill_up_missing(self): + return {'predicted_answer': None, 'is_correct': False} + + def is_incomplete(self, elem): + return 'is_correct' not in elem or 'predicted_answer' not in elem + + def update(self, predictions, aggregation_mode): + """Updating the evaluation results with the current element. + + Args: + predictions (list[dict]): aggregated predictions across all generations. + The content of the file is benchmark specific. + aggregation_mode (str): "best", "majority", "first", etc. Might vary by benchmark. 
+ """ + # this shouldn't do any heavy calculation, but just read the metric from existing json entry + # all the heavy lifting should be done in the evaluation script + self.total += 1 + if aggregation_mode == "best": + self.total_correct += any([elem['is_correct'] for elem in predictions]) + if all([elem['predicted_answer'] is None for elem in predictions]): + self.total_no_answer += 1 + elif aggregation_mode == "majority": + # TODO: currently majority does not take into account equivalent answers written in a different way + valid_answers_and_results = [ + (elem['predicted_answer'], elem['is_correct']) + for elem in predictions + if elem['predicted_answer'] is not None + ] + if len(valid_answers_and_results) == 0: + self.total_no_answer += 1 + else: + majority_result = Counter(valid_answers_and_results).most_common(1)[0][0] + self.total_correct += majority_result[1] + elif aggregation_mode == "first": + self.total_correct += predictions[0]['is_correct'] + self.total_no_answer += predictions[0]['predicted_answer'] is None + else: + raise ValueError(f"Unsupported mode {aggregation_mode}") + + def get_metrics(self): + return { + "num_entries": self.total, + "correct_answer": self.total_correct / self.total * 100.0, + "wrong_answer": (self.total - self.total_correct - self.total_no_answer) / self.total * 100.0, + "no_answer": self.total_no_answer / self.total * 100.0, + } + + def reset(self): + self.total_correct = 0 + self.total_no_answer = 0 + self.total = 0 + + +class CodeEval: + def __init__(self): + self.reset() + + def fill_up_missing(self): + return {'is_correct': False, 'is_correct-plus': False} + + def is_incomplete(self, elem): + return 'is_correct' not in elem or 'is_correct-plus' not in elem + + def update(self, predictions, aggregation_mode): + """Updating the evaluation results with the current element. + + Args: + predictions (list[dict]): aggregated predictions across all generations. + The content of the file is benchmark specific. + aggregation_mode (str): "best", "majority", "first", etc. Might vary by benchmark. + """ + # this shouldn't do any heavy calculation, but just read the metric from existing json entry + # all the heavy lifting should be done in the evaluation script + self.total += 1 + if aggregation_mode == "best": + self.total_correct += any([elem['is_correct'] for elem in predictions]) + self.total_correct_plus += any([elem['is_correct-plus'] for elem in predictions]) + elif aggregation_mode == "first": + self.total_correct += predictions[0]['is_correct'] + self.total_correct_plus += predictions[0]['is_correct-plus'] + else: + raise ValueError(f"Unsupported mode {aggregation_mode}") + + def get_metrics(self): + return { + "num_entries": self.total, + "passing_base_tests": self.total_correct / self.total * 100.0, + "passing_plus_tests": self.total_correct_plus / self.total * 100.0, + } + + def reset(self): + self.total_correct = 0 + self.total_correct_plus = 0 + self.total = 0 + + +class IFEval: + def __init__(self): + self.reset() + + def fill_up_missing(self): + return {'loose_eval': {'follow_all_instructions': False}, 'strict_eval': {'follow_all_instructions': False}} + + def is_incomplete(self, elem): + incomplete = 'loose_eval' not in elem or 'strict_eval' not in elem + if incomplete: + return False + return ( + 'follow_all_instructions' not in elem['loose_eval'] or 'follow_all_instructions' not in elem['strict_eval'] + ) + + def update(self, predictions, aggregation_mode): + """Updating the evaluation results with the current element. 
+ + Args: + predictions (list[dict]): aggregated predictions across all generations. + The content of the file is benchmark specific. + aggregation_mode (str): "best", "majority", "first", etc. Might vary by benchmark. + """ + # this shouldn't do any heavy calculation, but just read the metric from existing json entry + # all the heavy lifting should be done in the evaluation script + self.total += 1 + if aggregation_mode == "best": + self.total_correct_loose += any([elem['loose_eval']['follow_all_instructions'] for elem in predictions]) + self.total_correct_strict += any([elem['strict_eval']['follow_all_instructions'] for elem in predictions]) + elif aggregation_mode == "first": + self.total_correct_loose += predictions[0]['loose_eval']['follow_all_instructions'] + self.total_correct_strict += predictions[0]['strict_eval']['follow_all_instructions'] + else: + raise ValueError(f"Unsupported mode {aggregation_mode}") + + def get_metrics(self): + return { + "num_entries": self.total, + "strict_accuracy": self.total_correct_strict / self.total * 100.0, + "loose_accuracy": self.total_correct_loose / self.total * 100.0, + } + + def reset(self): + self.total_correct_loose = 0 + self.total_correct_strict = 0 + self.total = 0 + + +def read_predictions(predictions, evaluator, allow_incomplete=False): + data = [] + for prediction in predictions: + if not prediction: # could have missing predictions + if not allow_incomplete: + raise RuntimeError("Some data is missing!") + data.append(evaluator.fill_up_missing()) + continue + prediction_dict = json.loads(prediction) + if not prediction_dict: + if not allow_incomplete: + raise RuntimeError("Some data is missing!") + data.append(evaluator.fill_up_missing()) + continue + if evaluator.is_incomplete(prediction_dict): + if not allow_incomplete: + raise RuntimeError("Some data is missing!") + data.append(evaluator.fill_up_missing()) + continue + data.append(prediction_dict) + + return data + + +def compute_metrics( + prediction_jsonl_files, + evaluator, + allow_incomplete=False, + max_samples=-1, + aggregation_mode='first', +): + file_handles = [open(file, "rt", encoding="utf-8") for file in unroll_files(prediction_jsonl_files)] + + evaluator.reset() + for idx, predictions in enumerate(zip_longest(*file_handles)): + if idx == max_samples: + break + data = read_predictions(predictions, evaluator, allow_incomplete) + evaluator.update(data, aggregation_mode) + + for file_handle in file_handles: + file_handle.close() + + return evaluator.get_metrics() diff --git a/nemo_skills/evaluation/settings.py b/nemo_skills/evaluation/settings.py new file mode 100644 index 000000000..5e555ec47 --- /dev/null +++ b/nemo_skills/evaluation/settings.py @@ -0,0 +1,75 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# a collection of settings required to correctly running evaluations for different benchmarks +# in addition to what's in this file, there is also some prompt engineering that needs +# to happen in eval_map.py inside prompt folder for specific models + +from nemo_skills.evaluation.graders import code_eval, ifeval, math_eval +from nemo_skills.evaluation.metrics import CodeEval, IFEval, MathEval + +MATH_BENCHMARKS = [ + 'algebra222', + 'asdiv', + 'functional', + 'gsm-hard', + 'gsm-ic-2step', + 'gsm-ic-mstep', + 'gsm-plus', + 'gsm8k', + 'math', + 'mawps', + 'svamp', + 'tabmwp', + 'math-odyssey', + 'aime-2024', +] +CODE_BENCHMARKS = ['human-eval', 'mbpp'] + + +# ------------------------------- metrics settings ----------------------------- +EVALUATOR_MAP = { + "ifeval": IFEval, + "mmlu": MathEval, # TODO: update this +} + +for benchmark in MATH_BENCHMARKS: + EVALUATOR_MAP[benchmark] = MathEval + +for benchmark in CODE_BENCHMARKS: + EVALUATOR_MAP[benchmark] = CodeEval +# ------------------------------------------------------------------------------ + +# -------------------------------- eval settings ------------------------------- +EXTRA_EVAL_ARGS = { + # some benchmarks require specific extra arguments, which are defined here + 'human-eval': '++eval_type=code ++eval_config.dataset=humaneval', + 'mbpp': '++eval_type=code ++eval_config.dataset=mbpp', + 'ifeval': '++eval_type=ifeval', +} + +# TODO: better name? +GRADING_MAP = { + "math": math_eval, # that's default. TODO: should we do this per-benchmark? + "code": code_eval, + "ifeval": ifeval, +} +# ------------------------------------------------------------------------------ + +# --------------------------------- gen settings ------------------------------- +EXTRA_GENERATION_ARGS = { + # some benchmarks require specific extra arguments, which are defined here + 'ifeval': '++generation_key=response', +} +# ------------------------------------------------------------------------------ diff --git a/nemo_skills/finetuning/data_preparation_utils/prepare_sft_data.yaml b/nemo_skills/finetuning/data_preparation_utils/prepare_sft_data.yaml index d9c9dc9ab..98dc6e5ba 100644 --- a/nemo_skills/finetuning/data_preparation_utils/prepare_sft_data.yaml +++ b/nemo_skills/finetuning/data_preparation_utils/prepare_sft_data.yaml @@ -16,6 +16,7 @@ num_output_samples: null prompt_type: openmathinstruct/sft chat_format: false # whether to use NeMo's chat format +generation_suffix: "" # suffix to add to the end of expected response (e.g. 
<|eot_id|>) filters: solution_key: generation # key to filter solutions by @@ -92,4 +93,5 @@ processors: output_manifest_file: ${output_path} prompt_type: ${prompt_type} chat_format: ${chat_format} + generation_suffix: ${generation_suffix} metadata: ${metadata} diff --git a/nemo_skills/finetuning/data_preparation_utils/preprocessing.py b/nemo_skills/finetuning/data_preparation_utils/preprocessing.py index f0c1b190c..c49e8ecae 100644 --- a/nemo_skills/finetuning/data_preparation_utils/preprocessing.py +++ b/nemo_skills/finetuning/data_preparation_utils/preprocessing.py @@ -229,11 +229,21 @@ def process(self): class WriteFinalSftManifest(BaseProcessor): - def __init__(self, prompt_type: str, chat_format: bool = False, metadata: Optional[Dict] = None, **kwargs): + def __init__( + self, + prompt_type: str, + chat_format: bool = False, + generation_suffix: str = "", + metadata: Optional[Dict] = None, + **kwargs, + ): super().__init__(**kwargs) self.prompt_type = prompt_type self.chat_format = chat_format self.metadata = metadata + self.generation_suffix = generation_suffix + if self.generation_suffix and self.chat_format: + raise ValueError("generation_suffix can only be used with chat_format=False") if not self.metadata: self.metadata = {} @@ -266,7 +276,7 @@ def process(self): elem['type'] = None else: elem["input"] = prompt.build_string(input_dict={"question": elem['question']}) - elem["output"] = elem.pop("generation") + elem["output"] = elem.pop("generation") + self.generation_suffix elem.update(self.metadata) fout.write(json.dumps(elem) + "\n") samples_count += 1 diff --git a/nemo_skills/finetuning/sft_config.yaml b/nemo_skills/finetuning/sft_config.yaml index bc5d9f523..2d464f711 100644 --- a/nemo_skills/finetuning/sft_config.yaml +++ b/nemo_skills/finetuning/sft_config.yaml @@ -27,15 +27,9 @@ trainer: val_check_interval: 1000 # TODO: currently float values are not supported save_interval: ${.val_check_interval} - limit_val_batches: 1.0 + limit_val_batches: 1 # since loss is not meaningful by default we are limiting the validation batches to 1 gradient_clip_val: null - inference_metrics: - code_generation_accuracy: - _target_: nemo_skills.finetuning.code_generation_accuracy.CodeGenerationAccuracyMetric - sandbox_cfg: - sandbox_type: local - # do not change these logger: False # logger provided by exp_manager enable_checkpointing: False @@ -56,9 +50,9 @@ exp_manager: resume_ignore_no_checkpoint: True create_checkpoint_callback: True checkpoint_callback_params: - monitor: val_code_generation_accuracy + monitor: val_loss save_top_k: 8 - mode: max + mode: min save_nemo_on_train_end: False filename: 'megatron_gpt_sft--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}-{epoch}' model_parallel_size: ${model.tensor_model_parallel_size} @@ -94,29 +88,10 @@ model: seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value use_flash_attention: null # if not None, will match the base model's value - hidden_dropout: 0.1 - attention_dropout: 0.1 - ffn_dropout: 0.1 - - inference: - sampling_params: - use_greedy: True - # most other parameters are not used if greedy is True - temperature: 0.7 - top_k: 0 - top_p: 0.95 - repetition_penalty: 1.0 - add_BOS: False - all_probs: False - compute_logprob: False - end_strings: ["<|endoftext|>", ""] - length_params: - min_length: 0 - max_length: 512 - strategy: - _target_: nemo_skills.inference.inference_strategy.CodeExecutionStrategy - sandbox_cfg: - sandbox_type: local + 
hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + peft: peft_scheme: "none" # ["lora", "none"] restore_from_path: null @@ -152,7 +127,7 @@ model: # Example of how each dataset is formatted # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} file_path: ??? # Path to a JSONL file corresponding to the source data. Data format is identical to validation_ds. - global_batch_size: 128 + global_batch_size: 256 micro_batch_size: 1 shuffle: True memmap_workers: null diff --git a/nemo_skills/finetuning/sft_config_codegen.yaml b/nemo_skills/finetuning/sft_config_codegen.yaml new file mode 100644 index 000000000..bc5d9f523 --- /dev/null +++ b/nemo_skills/finetuning/sft_config_codegen.yaml @@ -0,0 +1,203 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: megatron_gpt_sft + +trainer: + num_nodes: 1 + devices: 1 + accelerator: gpu + precision: bf16 + + sft: + max_epochs: 2 + max_steps: -1 + + val_check_interval: 1000 # TODO: currently float values are not supported + save_interval: ${.val_check_interval} + + limit_val_batches: 1.0 + gradient_clip_val: null + + inference_metrics: + code_generation_accuracy: + _target_: nemo_skills.finetuning.code_generation_accuracy.CodeGenerationAccuracyMetric + sandbox_cfg: + sandbox_type: local + + # do not change these + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_time: null + max_epochs: ${.sft.max_epochs} + max_steps: ${.sft.max_steps} + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_code_generation_accuracy + save_top_k: 8 + mode: max + save_nemo_on_train_end: False + filename: 'megatron_gpt_sft--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}-{epoch}' + model_parallel_size: ${model.tensor_model_parallel_size} + save_best_model: False # need to keep this false otherwise it will create multiple last.ckpt files because restore reset the previous best model + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + restore_from_path: ??? # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. 
+ sync_batch_comm: False + megatron_amp_O2: True + encoder_seq_length: 4096 # the sequence length of the encoder model, it will be overwriten by loaded GPT model + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: full # 'selective' or 'full' + activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: 1 # not used with 'selective' + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. More details in megatron_gpt_config.yaml. + answer_only_loss: True # not used right now + gradient_as_bucket_view: False + seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value + use_flash_attention: null # if not None, will match the base model's value + + hidden_dropout: 0.1 + attention_dropout: 0.1 + ffn_dropout: 0.1 + + inference: + sampling_params: + use_greedy: True + # most other parameters are not used if greedy is True + temperature: 0.7 + top_k: 0 + top_p: 0.95 + repetition_penalty: 1.0 + add_BOS: False + all_probs: False + compute_logprob: False + end_strings: ["<|endoftext|>", ""] + length_params: + min_length: 0 + max_length: 512 + strategy: + _target_: nemo_skills.inference.inference_strategy.CodeExecutionStrategy + sandbox_cfg: + sandbox_type: local + peft: + peft_scheme: "none" # ["lora", "none"] + restore_from_path: null + + lora_tuning: + target_modules: ['attention_qkv'] # currently only support attention_qkv + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + data: + chat: False # whether use chatbot data or not + chat_prompt_tokens: # special tokens for the chat prompts, a dictionary of {token_type: token}. note that some tokenizer may combine the characters at the junction between {end_of_turn}{turn_start}. e.g. '', the '><' sometimes is merged to be a single token. 
This is not supported, try to avoid + system_turn_start: "" + turn_start: "" + label_start: "" + end_of_turn: "\x0A" # \0x0A is '\n' + end_of_name: "\x0A" # \0x0A is '\n' + + sample: False # create the index mapping files for the sample data, so max_steps * global_batch_size can be larger than the dataset size + num_workers: 1 + dataloader_type: single # only supports single + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_path: ??? # Path to a JSONL file corresponding to the source data. Data format is identical to validation_ds. + global_batch_size: 128 + micro_batch_size: 1 + shuffle: True + memmap_workers: null + max_seq_length: 4096 + min_seq_length: 1 + drop_last: True + label_key: 'output' + add_eos: True + add_sep: False + add_bos: False + truncation_field: "input" # # Can be multiple keys separated with ',' Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input}{output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + hf_dataset: False # Whether to load the json file with the HuggingFace dataset. otherwise, will load the jsonl file with the JSONLMemMapDataset. + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + + validation_ds: + file_path: ??? # Path to a JSONL file corresponding to the source data. Data format is identical to validation_ds. + global_batch_size: ${model.data.train_ds.global_batch_size} + micro_batch_size: ${model.data.train_ds.micro_batch_size} + shuffle: False + memmap_workers: ${model.data.train_ds.memmap_workers} + max_seq_length: ${model.data.train_ds.max_seq_length} + min_seq_length: 1 + drop_last: False + label_key: ${model.data.train_ds.label_key} + add_eos: ${model.data.train_ds.add_eos} + add_sep: ${model.data.train_ds.add_sep} + add_bos: ${model.data.train_ds.add_bos} + truncation_field: ${model.data.train_ds.truncation_field} # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + hf_dataset: False # Whether to load the json file with the HuggingFace dataset. otherwise, will load the jsonl file with the JSONLMemMapDataset. + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + output_original_text: True # needed for the proper metrics support + + optim: + name: distributed_fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work. 
+ lr: 1e-6 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + # sched: + # name: CosineAnnealing + # warmup_steps: 0 + # constant_steps: 0 + # min_lr: 1e-6 \ No newline at end of file diff --git a/pipeline/compute_metrics.py b/pipeline/compute_metrics.py index ecbf9ffc9..84ffb5bce 100644 --- a/pipeline/compute_metrics.py +++ b/pipeline/compute_metrics.py @@ -16,287 +16,18 @@ import json import logging import sys -from collections import Counter, defaultdict -from itertools import zip_longest from pathlib import Path # adding nemo_skills to python path to avoid requiring installation sys.path.append(str(Path(__file__).absolute().parents[1])) -from nemo_skills.utils import setup_logging, unroll_files +from nemo_skills.evaluation.metrics import compute_metrics +from nemo_skills.evaluation.settings import EVALUATOR_MAP +from nemo_skills.utils import setup_logging LOG = logging.getLogger(__file__) -class MathEval: - def __init__(self): - self.reset() - - def fill_up_missing(self): - return {'predicted_answer': None, 'is_correct': False} - - def is_incomplete(self, elem): - return 'is_correct' not in elem or 'predicted_answer' not in elem - - def update(self, predictions, aggregation_mode): - """Updating the evaluation results with the current element. - - Args: - predictions (list[dict]): aggregated predictions across all generations. - The content of the file is benchmark specific. - aggregation_mode (str): "best", "majority", "first", etc. Might vary by benchmark. - """ - # this shouldn't do any heavy calculation, but just read the metric from existing json entry - # all the heavy lifting should be done in the evaluation script - self.total += 1 - if aggregation_mode == "best": - self.total_correct += any([elem['is_correct'] for elem in predictions]) - if all([elem['predicted_answer'] is None for elem in predictions]): - self.total_no_answer += 1 - elif aggregation_mode == "majority": - # TODO: currently majority does not take into account equivalent answers written in a different way - valid_answers_and_results = [ - (elem['predicted_answer'], elem['is_correct']) - for elem in predictions - if elem['predicted_answer'] is not None - ] - if len(valid_answers_and_results) == 0: - self.total_no_answer += 1 - else: - majority_result = Counter(valid_answers_and_results).most_common(1)[0][0] - self.total_correct += majority_result[1] - elif aggregation_mode == "first": - self.total_correct += predictions[0]['is_correct'] - self.total_no_answer += predictions[0]['predicted_answer'] is None - else: - raise ValueError(f"Unsupported mode {aggregation_mode}") - - def get_metrics(self): - return { - "num_entries": self.total, - "correct_answer": self.total_correct / self.total * 100.0, - "wrong_answer": (self.total - self.total_correct - self.total_no_answer) / self.total * 100.0, - "no_answer": self.total_no_answer / self.total * 100.0, - } - - def reset(self): - self.total_correct = 0 - self.total_no_answer = 0 - self.total = 0 - - -class CodeEval: - def __init__(self): - self.reset() - - def fill_up_missing(self): - return {'is_correct': False, 'is_correct-plus': False} - - def is_incomplete(self, elem): - return 'is_correct' not in elem or 'is_correct-plus' not in elem - - def update(self, predictions, aggregation_mode): - """Updating the evaluation results with the current element. - - Args: - predictions (list[dict]): aggregated predictions across all generations. - The content of the file is benchmark specific. - aggregation_mode (str): "best", "majority", "first", etc. Might vary by benchmark. 
- """ - # this shouldn't do any heavy calculation, but just read the metric from existing json entry - # all the heavy lifting should be done in the evaluation script - self.total += 1 - if aggregation_mode == "best": - self.total_correct += any([elem['is_correct'] for elem in predictions]) - self.total_correct_plus += any([elem['is_correct-plus'] for elem in predictions]) - elif aggregation_mode == "first": - self.total_correct += predictions[0]['is_correct'] - self.total_correct_plus += predictions[0]['is_correct-plus'] - else: - raise ValueError(f"Unsupported mode {aggregation_mode}") - - def get_metrics(self): - return { - "num_entries": self.total, - "passing_base_tests": self.total_correct / self.total * 100.0, - "passing_plus_tests": self.total_correct_plus / self.total * 100.0, - } - - def reset(self): - self.total_correct = 0 - self.total_correct_plus = 0 - self.total = 0 - - -class IFEval: - # loosely adapted from - # https://github.com/google-research/google-research/blob/master/instruction_following_eval/evaluation_main.py - - required_keys = ['follow_instruction_list', 'instruction_id_list'] - - def __init__(self): - self.reset() - - def fill_up_missing(self): - return { - 'loose_eval': {key: [] for key in self.required_keys}, - 'strict_eval': {key: [] for key in self.required_keys}, - } - - def is_incomplete(self, elem): - incomplete = 'loose_eval' not in elem or 'strict_eval' not in elem - if incomplete: - return True - - if any([key not in elem['loose_eval'] for key in self.required_keys]): - return True - - if any([key not in elem['strict_eval'] for key in self.required_keys]): - return True - - return False - - def _update_single_stat(self, stats_dict, elems): - """Will update using the pass@k strategy (just pass a single-element list to get greedy).""" - # has to be the same across all elements as they are solutions for the same question - instruction_id_list = elems[0]['instruction_id_list'] - # computing "pass@k" score - follow_instruction_list = elems[0]['follow_instruction_list'] - for elem in elems: - follow_instruction_list = [ - follow_instruction_list[i] or elem['follow_instruction_list'][i] - for i in range(len(follow_instruction_list)) - ] - - stats_dict['prompt']['total'] += 1 - if all(follow_instruction_list): - stats_dict['prompt']['correct'] += 1 - - stats_dict['instruction']['total'] += len(instruction_id_list) - stats_dict['instruction']['correct'] += sum(follow_instruction_list) - - for instruction_id, followed_or_not in zip(instruction_id_list, follow_instruction_list): - instruction_id = instruction_id.split(":")[0] - stats_dict['tier0']['total'][instruction_id] += 1 - if followed_or_not: - stats_dict['tier0']['correct'][instruction_id] += 1 - - def update(self, predictions, aggregation_mode): - """Updating the evaluation results with the current element. - - Args: - predictions (list[dict]): aggregated predictions across all generations. - The content of the file is benchmark specific. - aggregation_mode (str): "best", "majority", "first", etc. Might vary by benchmark. 
- """ - # this shouldn't do any heavy calculation, but just read the metric from existing json entry - # all the heavy lifting should be done in the evaluation script - if aggregation_mode == "best": - self._update_single_stat(self.strict_stats, [pred['strict_eval'] for pred in predictions]) - self._update_single_stat(self.loose_stats, [pred['loose_eval'] for pred in predictions]) - elif aggregation_mode == "first": - self._update_single_stat(self.strict_stats, [predictions[0]['strict_eval']]) - self._update_single_stat(self.loose_stats, [predictions[0]['loose_eval']]) - else: - raise ValueError(f"Unsupported mode {aggregation_mode}") - - def get_metrics(self): - prompt_total = self.strict_stats['prompt']['total'] - inst_total = self.strict_stats['instruction']['total'] - return { - "num_prompts": prompt_total, - "num_instructions": inst_total, - "prompt_strict_accuracy": self.strict_stats['prompt']['correct'] / prompt_total * 100.0, - "instruction_strict_accuracy": self.strict_stats['instruction']['correct'] / inst_total * 100.0, - "prompt_loose_accuracy": self.loose_stats['prompt']['correct'] / prompt_total * 100.0, - "instruction_loose_accuracy": self.loose_stats['instruction']['correct'] / inst_total * 100.0, - } - - def reset(self): - # the original code also has a deeper breakdown into tier1 scores, - # but that's probably too much for us to track at this stage - self.strict_stats = { - "prompt": {"total": 0, "correct": 0}, - "instruction": {"total": 0, "correct": 0}, - "tier0": {"total": defaultdict(int), "correct": defaultdict(int)}, - } - self.loose_stats = { - "prompt": {"total": 0, "correct": 0}, - "instruction": {"total": 0, "correct": 0}, - "tier0": {"total": defaultdict(int), "correct": defaultdict(int)}, - } - - -def compute_metrics( - prediction_jsonl_files, - evaluator, - allow_incomplete=False, - max_samples=-1, - aggregation_mode='first', -): - file_handles = [open(file, "rt", encoding="utf-8") for file in unroll_files(prediction_jsonl_files)] - - evaluator.reset() - for idx, lines in enumerate(zip_longest(*file_handles)): - if idx == max_samples: - break - data = [] - for line in lines: - if not line: # could have missing predictions - if not allow_incomplete: - raise RuntimeError("Some data is missing!") - data.append(evaluator.fill_up_missing()) - continue - line_dict = json.loads(line) - if not line_dict: - if not allow_incomplete: - raise RuntimeError("Some data is missing!") - data.append(evaluator.fill_up_missing()) - continue - if evaluator.is_incomplete(line_dict): - if not allow_incomplete: - raise RuntimeError("Some data is missing!") - data.append(evaluator.fill_up_missing()) - continue - data.append(line_dict) - - evaluator.update(data, aggregation_mode) - - for file_handle in file_handles: - file_handle.close() - - return evaluator.get_metrics() - - -# TODO: move all benchmark-specific things to some unified place -math_benchmarks = [ - 'algebra222', - 'asdiv', - 'functional', - 'gsm-hard', - 'gsm-ic-2step', - 'gsm-ic-mstep', - 'gsm-plus', - 'gsm8k', - 'math', - 'mawps', - 'svamp', - 'tabmwp', -] -code_benchmarks = ['human-eval', 'mbpp'] - -EVALUATOR_MAP = { - "ifeval": IFEval, - "mmlu": MathEval, # TODO: update this -} - -for benchmark in math_benchmarks: - EVALUATOR_MAP[benchmark] = MathEval - -for benchmark in code_benchmarks: - EVALUATOR_MAP[benchmark] = CodeEval - - if __name__ == '__main__': setup_logging(disable_hydra_logs=False) parser = argparse.ArgumentParser() diff --git a/pipeline/run_eval.py b/pipeline/run_eval.py index c403b994c..f7a8356e6 
100644 --- a/pipeline/run_eval.py +++ b/pipeline/run_eval.py @@ -29,6 +29,7 @@ To see all supported agruments, nemo_skills package needs to be installed. Please note that it is not recommended to install Python packages on a slurm cluster login node. """ +from nemo_skills.evaluation.settings import EXTRA_EVAL_ARGS, EXTRA_GENERATION_ARGS from nemo_skills.utils import setup_logging SCRIPT_HELP = """ @@ -46,7 +47,7 @@ def get_greedy_cmd( benchmark, output_name='output-greedy.jsonl', extra_eval_args="", extra_arguments="", eval_map=None ): extra_eval_args = f"{EXTRA_EVAL_ARGS.get(benchmark, '')} {extra_eval_args}" - extra_arguments = f"{EXTRA_ARGS.get(benchmark, '')} {extra_arguments}" + extra_arguments = f"{EXTRA_GENERATION_ARGS.get(benchmark, '')} {extra_arguments}" if eval_map: extra_arguments = f"+prompt={eval_map.get(benchmark, eval_map['default'])} {extra_arguments}" return f"""echo "Evaluating benchmark {benchmark}" && \ @@ -92,18 +93,6 @@ def get_sampling_cmd(benchmark, random_seed, extra_eval_args="", extra_arguments MOUNTS = "{NEMO_SKILLS_CODE}:/code,{model_path}:/model,{output_dir}:/results" JOB_NAME = "eval-{model_name}" -EXTRA_EVAL_ARGS = { - # some benchmarks require specific extra arguments, which are defined here - 'human-eval': '++eval_type=code ++eval_config.dataset=humaneval', - 'mbpp': '++eval_type=code ++eval_config.dataset=mbpp', - 'ifeval': '++eval_type=ifeval', -} - -EXTRA_ARGS = { - # some benchmarks require specific extra arguments, which are defined here - 'ifeval': '++generation_key=response', -} - if __name__ == "__main__": setup_logging(disable_hydra_logs=False) diff --git a/pipeline/run_sft.py b/pipeline/run_sft.py index 3a1aa79b7..2fb6751e5 100644 --- a/pipeline/run_sft.py +++ b/pipeline/run_sft.py @@ -63,6 +63,7 @@ parser.add_argument( "--validation_dataset", default="gsm8k", + # TODO: how to disable it by default? help="Validation dataset to use. 
Make sure it exists inside datasets folder", ) parser.add_argument("--num_nodes", type=int, default=1) @@ -71,6 +72,7 @@ "--disable_wandb", action="store_true", help="Disable wandb logging and use tensorboard instead" ) parser.add_argument("--chat_format", action="store_true", help="Use chat format for SFT data") + parser.add_argument("--with_sandbox", action="store_true", help="If sandbox is required for code generation") parser.add_argument( "--partition", required=False, @@ -142,5 +144,5 @@ container=CLUSTER_CONFIG["containers"]["nemo"], mounts=MOUNTS.format(**format_dict), partition=args.partition, - with_sandbox=True, + with_sandbox=args.with_sandbox, ) diff --git a/pipeline/summarize_results.py b/pipeline/summarize_results.py index 31c78999f..7070968a6 100644 --- a/pipeline/summarize_results.py +++ b/pipeline/summarize_results.py @@ -17,7 +17,6 @@ import argparse import glob import json -import logging import subprocess import sys from collections import defaultdict @@ -29,6 +28,8 @@ from compute_metrics import EVALUATOR_MAP, compute_metrics +from nemo_skills.evaluation.metrics import MathEval + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -58,7 +59,7 @@ if not Path(benchmark_path).is_dir(): continue try: - evaluator = EVALUATOR_MAP[benchmark]() + evaluator = EVALUATOR_MAP.get(benchmark, MathEval)() if benchmark in ['human-eval', 'mbpp']: if Path(f'{benchmark_path}/output-greedy.jsonl').exists(): results[benchmark]['greedy'] = compute_metrics( @@ -99,22 +100,22 @@ lines_to_write = [] for benchmark, benchmark_results in results.items(): max_widths = {} - max_widths['evaluation mode'] = len('evaluation mode') + max_widths['evaluation_mode'] = len('evaluation_mode') for eval_mode, metrics in benchmark_results.items(): for metric_key, metric_value in metrics.items(): max_widths[metric_key] = max( max_widths.get(metric_key, len(metric_key)), len(f"{metric_value:.2f}" if isinstance(metric_value, float) else str(metric_value)), ) - max_widths['evaluation mode'] = max(max_widths['evaluation mode'], len(eval_mode)) + max_widths['evaluation_mode'] = max(max_widths['evaluation_mode'], len(eval_mode)) total_width = sum(max_widths.values()) + (len(max_widths) - 1) * 3 print(f' {benchmark} '.center(total_width, '-')) - headers = ['evaluation mode'] + list(list(benchmark_results.values())[0].keys()) + headers = ['evaluation_mode'] + list(list(benchmark_results.values())[0].keys()) print(' | '.join([f'{header:<{max_widths[header]}}' for header in headers])) for eval_mode, metrics in benchmark_results.items(): - values = [f'{eval_mode:<{max_widths["evaluation mode"]}}'] + values = [f'{eval_mode:<{max_widths["evaluation_mode"]}}'] for metric_key, metric_value in metrics.items(): if isinstance(metric_value, float): metric_value = f"{metric_value:.2f}" diff --git a/tests/check_help.py b/tests/check_help.py index 6cc022e2e..4494b6ee9 100644 --- a/tests/check_help.py +++ b/tests/check_help.py @@ -35,6 +35,7 @@ skills_script_list = [ 'nemo_skills/inference/generate_solutions.py', 'nemo_skills/evaluation/evaluate_results.py', + 'nemo_skills/evaluation/fill_majority_answer.py', ] diff --git a/tests/gpu-tests/test_finetuning.py b/tests/gpu-tests/test_finetuning.py index cdf1d678f..8da5e811d 100644 --- a/tests/gpu-tests/test_finetuning.py +++ b/tests/gpu-tests/test_finetuning.py @@ -17,7 +17,6 @@ # you'd also need 2+ GPUs to run this test # the metrics are assuming llama3-8b-base as the model and will fail for other models -import json import os import subprocess import 
sys @@ -25,8 +24,8 @@ import pytest -sys.path.append(str(Path(__file__).absolute().parents[2] / 'pipeline')) -from compute_metrics import MathEval, compute_metrics +sys.path.append(str(Path(__file__).absolute().parents[1])) +from nemo_skills.evaluation.metrics import MathEval, compute_metrics def test_sft_pipeline(): diff --git a/tests/gpu-tests/test_generation.py b/tests/gpu-tests/test_generation.py index 52d1b6302..92b124060 100644 --- a/tests/gpu-tests/test_generation.py +++ b/tests/gpu-tests/test_generation.py @@ -25,8 +25,8 @@ import pytest -sys.path.append(str(Path(__file__).absolute().parents[2] / 'pipeline')) -from compute_metrics import MathEval, compute_metrics +sys.path.append(str(Path(__file__).absolute().parents[1])) +from nemo_skills.evaluation.metrics import MathEval, compute_metrics def test_trtllm_run_eval():
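
Note (editorial, not part of the patch): the sketch below shows one way the refactored pieces introduced in this change are intended to fit together, assuming nemo_skills is importable; the benchmark name and the output file pattern are placeholders. It mirrors what summarize_results.py and the GPU tests above now do: look up the evaluator class in EVALUATOR_MAP (falling back to MathEval) and pass it to the shared compute_metrics helper from nemo_skills.evaluation.metrics.

from nemo_skills.evaluation.metrics import MathEval, compute_metrics
from nemo_skills.evaluation.settings import EVALUATOR_MAP

# pick the evaluator class registered for a benchmark; MathEval is the fallback
evaluator = EVALUATOR_MAP.get("gsm8k", MathEval)()

# aggregate results across several sampled generations with majority voting;
# the heavy grading already happened in evaluate_results.py, so this only reads
# the per-sample is_correct / predicted_answer fields back from the jsonl files
metrics = compute_metrics(
    prediction_jsonl_files=["results/gsm8k/output-rs*.jsonl"],  # placeholder pattern
    evaluator=evaluator,
    aggregation_mode="majority",
)
print(metrics)  # num_entries, correct_answer, wrong_answer, no_answer

The new fill_majority_answer.py script is driven the same way as the other Hydra entry points in this patch, i.e. with ++key=value overrides (something like ++prediction_jsonl_files="results/gsm8k/output-rs*.jsonl"), and writes the majority vote back into the expected_answer field of the same files.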