diff --git a/docs/reproducing-results.md b/docs/reproducing-results.md index 3e108a71e..3eaa925eb 100644 --- a/docs/reproducing-results.md +++ b/docs/reproducing-results.md @@ -200,6 +200,8 @@ you can run the following: --stages sft prepare_eval \ --num_nodes 8 \ --num_gpus 8 \ + --config-file sft_config_codegen \ + --with_sandbox \ ++model.data.train_ds.file_path=/data/sft-data.jsonl \ ++trainer.sft.max_epochs=4 \ ++trainer.sft.val_check_interval=4000 \ diff --git a/nemo_skills/evaluation/evaluate_results.py b/nemo_skills/evaluation/evaluate_results.py index cadae9d44..8bfb2f312 100644 --- a/nemo_skills/evaluation/evaluate_results.py +++ b/nemo_skills/evaluation/evaluate_results.py @@ -12,21 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json import logging -import shutil -import subprocess import sys -from argparse import Namespace from dataclasses import field -from pathlib import Path from typing import Any import hydra -from omegaconf import MISSING, OmegaConf +from omegaconf import MISSING -from nemo_skills.code_execution.sandbox import get_sandbox, sandbox_params -from nemo_skills.evaluation.code_utils import preprocess_code +from nemo_skills.code_execution.sandbox import sandbox_params +from nemo_skills.evaluation.settings import GRADING_MAP from nemo_skills.utils import get_help_message, nested_dataclass, setup_logging LOG = logging.getLogger(__file__) @@ -43,7 +38,7 @@ class EvaluateResultsConfig: # Sandbox configuration {sandbox_params} sandbox: dict = field(default_factory=lambda: {'sandbox_type': 'local'}) - eval_type: str = "math" # math or code + eval_type: str = "math" # math or code TODO: benchmark? 
eval_config: dict = field(default_factory=dict) def __post_init__(self): @@ -56,104 +51,15 @@ def __post_init__(self): cs.store(name="base_evaluate_results_config", node=EvaluateResultsConfig) -def math_eval(cfg): - sandbox = get_sandbox(**cfg.sandbox) - sandbox.batch_evaluate_results( - prediction_jsonl_files=cfg.prediction_jsonl_files, - **cfg.eval_config, - ) - - -def code_eval(cfg): - # TODO: need to move it to a separate docker (either our sandbox or separate srun) - from evalplus.evaluate import evaluate - - # processing each generation separately (TODO: evalplus can do it together, but need to figure out the format) - for jsonl_file in cfg.prediction_jsonl_files: - with open(jsonl_file) as f: - samples = [preprocess_code(json.loads(line)) for line in f] - # all changes will be done with a new key "completion", so it's ok to write to the same file - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - f.write(json.dumps(sample) + "\n") - eval_config = { - "samples": jsonl_file, - "base_only": False, - "parallel": None, - "i_just_wanna_run": False, - "test_details": False, - "min_time_limit": 1, - "gt_time_limit_factor": 4.0, - "mini": False, - "noextreme": False, - "version": "default", - } - eval_config.update(OmegaConf.to_container(cfg.eval_config)) - evaluate(Namespace(**eval_config)) - with open(jsonl_file[:-6] + '_eval_results.json', 'rt', encoding="utf-8") as fin: - evalplus_grades = json.load(fin) - # adding is_correct key to allow compute_metrics to work - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - sample['is_correct'] = evalplus_grades['eval'][sample['task_id']][0]['base_status'] == "pass" - sample['is_correct-plus'] = ( - sample['is_correct'] and evalplus_grades['eval'][sample['task_id']][0]['plus_status'] == "pass" - ) - f.write(json.dumps(sample) + "\n") - - # moving eval file as otherwise evalplus does not want to recompute metrics if it's present.. 
- shutil.move(jsonl_file[:-6] + '_eval_results.json', jsonl_file[:-6] + '_eval_results-saved.json') - - -def ifeval(cfg): - for jsonl_file in cfg.prediction_jsonl_files: - parent_dir = Path(jsonl_file).absolute().parent - cmd = ( - 'cd /opt/benchmarks/google-research && python -m instruction_following_eval.evaluation_main ' - f'--input_data={jsonl_file} ' - f'--input_response_data={jsonl_file} ' - f'--output_dir={parent_dir} ' - ) - subprocess.run(cmd, shell=True, check=True) - # fusing eval metrics back into the generation file - with open(jsonl_file, "rt", encoding="utf-8") as f: - samples = [json.loads(line) for line in f] - - with open(parent_dir / 'eval_results_loose.jsonl', 'rt', encoding="utf-8") as f: - eval_results = [json.loads(line) for line in f] - for sample, eval_result in zip(samples, eval_results): - sample['loose_eval'] = eval_result - - with open(parent_dir / 'eval_results_strict.jsonl', 'rt', encoding="utf-8") as f: - eval_results = [json.loads(line) for line in f] - for sample, eval_result in zip(samples, eval_results): - sample['strict_eval'] = eval_result - - with open(jsonl_file, "wt", encoding="utf-8") as f: - for sample in samples: - f.write(json.dumps(sample) + "\n") - - # removing metric files to avoid reusing them - (parent_dir / 'eval_results_loose.jsonl').unlink() - (parent_dir / 'eval_results_strict.jsonl').unlink() - - -eval_map = { - "math": math_eval, - "code": code_eval, - "ifeval": ifeval, -} - - @hydra.main(version_base=None, config_name="base_evaluate_results_config") def evaluate_results(cfg: EvaluateResultsConfig): cfg = EvaluateResultsConfig(_init_nested=True, **cfg) LOG.info("Config used: %s", cfg) - if cfg.eval_type not in eval_map: + if cfg.eval_type not in GRADING_MAP: raise ValueError(f"Unknown eval_type: {cfg.eval_type}") - eval_map[cfg.eval_type](cfg) + GRADING_MAP[cfg.eval_type](cfg) HELP_MESSAGE = get_help_message( diff --git a/nemo_skills/evaluation/fill_majority_answer.py 
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import sys
from collections import Counter
from contextlib import ExitStack
from itertools import zip_longest
from typing import Any

import hydra
from omegaconf import MISSING
from tqdm import tqdm

from nemo_skills.evaluation.metrics import MathEval, read_predictions
from nemo_skills.utils import get_help_message, nested_dataclass, setup_logging, unroll_files

LOG = logging.getLogger(__file__)


@nested_dataclass
class FillMajorityAnswerConfig:
    """Top-level parameters for the script"""

    # list of files to use for majority voting.
    # Can specify multiple patterns separated by space
    # e.g. "path/to/file1.jsonl path/to/file2.jsonl" or with regex
    # "test_folder/output-rs*.jsonl"
    prediction_jsonl_files: Any = MISSING

    # if set to False (default) will error if any responses/data is missing;
    # if set to True, missing entries are replaced with the evaluator's placeholder values
    allow_incomplete: bool = False

    # minimum number of majority votes to use the answer (the winning answer
    # needs strictly more votes than this).
    # -1 means use half of the votes, which is a good default value
    min_votes: int = -1

    # will be used to fill up when not enough votes are available for the majority
    default_answer: str = "no_answer"

    # will not use any negative answers as this likely indicates bad problems
    # (at least for GSM8K domain). If running with other data, where negative answers
    # are common, should be set to False
    drop_negative_answers: bool = False

    # will not use any non-integer answers as this might indicate bad problems
    drop_noninteger_answers: bool = False

    def __post_init__(self):
        """Splitting a space-separated string of files/patterns into a list."""
        if isinstance(self.prediction_jsonl_files, str):
            self.prediction_jsonl_files = self.prediction_jsonl_files.split(" ")


cs = hydra.core.config_store.ConfigStore.instance()
# NOTE(review): "conifg" is a typo, but the name is a runtime identifier shared with
# the hydra.main decorator below, so it is kept as-is.
cs.store(name="base_fill_majority_answer_conifg", node=FillMajorityAnswerConfig)


@hydra.main(version_base=None, config_name="base_fill_majority_answer_conifg")
def fill_majority_answer(cfg: FillMajorityAnswerConfig):
    """Overwrites "expected_answer" in every prediction file with the majority-voted answer.

    All files are read in lockstep (line i of every file is treated as the same question).
    For each question the most common (predicted_answer, is_correct) pair is selected;
    if it does not pass the vote-count / negative / non-integer filters from the config,
    cfg.default_answer is used instead. The files are then rewritten in place.

    Args:
        cfg: hydra config matching FillMajorityAnswerConfig.

    Raises:
        RuntimeError: raised by read_predictions if some data is missing
            and cfg.allow_incomplete is False.
    """
    cfg = FillMajorityAnswerConfig(_init_nested=True, **cfg)
    LOG.info("Config used: %s", cfg)

    # resolving the patterns once, so that reading and writing go through
    # exactly the same files in exactly the same order
    prediction_files = list(unroll_files(cfg.prediction_jsonl_files))
    if cfg.min_votes < 0:
        cfg.min_votes = len(prediction_files) // 2

    # currently majority is only defined for math evals
    evaluator = MathEval()

    majority_answers = []
    all_predictions = []
    retained_questions = 0
    # ExitStack guarantees that all handles are closed even if reading fails midway
    with ExitStack() as stack:
        file_handles = [stack.enter_context(open(file, "rt", encoding="utf-8")) for file in prediction_files]
        for predictions in tqdm(zip_longest(*file_handles)):
            data = read_predictions(predictions, evaluator, cfg.allow_incomplete)
            all_predictions.append(data)
            # TODO: currently majority does not take into account equivalent answers written in a different way
            valid_answers_and_results = [
                (elem['predicted_answer'], elem['is_correct']) for elem in data if elem['predicted_answer'] is not None
            ]
            # starting from the default answer, which is only replaced if all the filters below pass
            majority_answers.append(cfg.default_answer)
            if not valid_answers_and_results:
                continue
            (majority_answer, _), num_votes = Counter(valid_answers_and_results).most_common(1)[0]

            if num_votes <= cfg.min_votes:
                continue

            if cfg.drop_negative_answers or cfg.drop_noninteger_answers:
                # NOTE(review): from here on the answer is the float-converted value, so it is
                # also stored as a float (e.g. 5.0), not as the original string - confirm this is intended
                try:
                    majority_answer = float(majority_answer)
                except ValueError:
                    continue

            if cfg.drop_negative_answers and majority_answer < 0:
                continue

            if cfg.drop_noninteger_answers and not majority_answer.is_integer():
                continue

            majority_answers[-1] = majority_answer
            retained_questions += 1

    LOG.info("Total questions: %d, retained questions: %d", len(all_predictions), retained_questions)

    # writing the majority answers back to the files
    with ExitStack() as stack:
        file_handles = [stack.enter_context(open(file, "wt", encoding="utf-8")) for file in prediction_files]
        for predictions, majority_answer in zip(all_predictions, majority_answers):
            # read_predictions returns exactly one entry per file, in file order
            for prediction, handle in zip(predictions, file_handles):
                prediction["expected_answer"] = majority_answer
                handle.write(json.dumps(prediction) + "\n")


HELP_MESSAGE = get_help_message(FillMajorityAnswerConfig)


if __name__ == "__main__":
    if '--help' in sys.argv or '-h' in sys.argv:
        print(HELP_MESSAGE)
    else:
        setup_logging()
        fill_majority_answer()
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import shutil
import subprocess
from argparse import Namespace
from pathlib import Path

LOG = logging.getLogger(__file__)


def _read_jsonl(path):
    """Returns the list of json objects stored one-per-line in the given file."""
    with open(path, "rt", encoding="utf-8") as fin:
        return [json.loads(line) for line in fin]


def _write_jsonl(path, samples):
    """Overwrites the given file with one json object per line."""
    with open(path, "wt", encoding="utf-8") as fout:
        for sample in samples:
            fout.write(json.dumps(sample) + "\n")


def math_eval(cfg):
    """Grading math predictions with the sandbox.

    Args:
        cfg: config with "sandbox" (kwargs for get_sandbox), "prediction_jsonl_files"
            and "eval_config" (extra kwargs forwarded to batch_evaluate_results).
    """
    from nemo_skills.code_execution.sandbox import get_sandbox

    sandbox = get_sandbox(**cfg.sandbox)
    sandbox.batch_evaluate_results(
        prediction_jsonl_files=cfg.prediction_jsonl_files,
        **cfg.eval_config,
    )


def code_eval(cfg):
    """Grading code predictions with evalplus and fusing the result back into each file.

    Every file in cfg.prediction_jsonl_files is rewritten in place with the
    "is_correct" / "is_correct-plus" keys added to each sample.

    Args:
        cfg: config with "prediction_jsonl_files" and "eval_config"
            (overrides for the default evalplus arguments defined below).
    """
    # TODO: need to move it to a separate docker (either our sandbox or separate srun)
    from evalplus.evaluate import evaluate
    from omegaconf import OmegaConf

    from nemo_skills.evaluation.code_utils import preprocess_code

    # processing each generation separately (TODO: evalplus can do it together, but need to figure out the format)
    for jsonl_file in cfg.prediction_jsonl_files:
        samples = [preprocess_code(sample) for sample in _read_jsonl(jsonl_file)]
        # all changes will be done with a new key "completion", so it's ok to write to the same file
        _write_jsonl(jsonl_file, samples)
        eval_config = {
            "samples": jsonl_file,
            "base_only": False,
            "parallel": None,
            "i_just_wanna_run": False,
            "test_details": False,
            "min_time_limit": 1,
            "gt_time_limit_factor": 4.0,
            "mini": False,
            "noextreme": False,
            "version": "default",
        }
        eval_config.update(OmegaConf.to_container(cfg.eval_config))
        evaluate(Namespace(**eval_config))
        # NOTE: stripping the last 6 characters assumes that the file name ends with ".jsonl"
        results_file = jsonl_file[:-6] + '_eval_results.json'
        with open(results_file, 'rt', encoding="utf-8") as fin:
            evalplus_grades = json.load(fin)
        # adding is_correct key to allow compute_metrics to work
        for sample in samples:
            grade = evalplus_grades['eval'][sample['task_id']][0]
            sample['is_correct'] = grade['base_status'] == "pass"
            sample['is_correct-plus'] = sample['is_correct'] and grade['plus_status'] == "pass"
        _write_jsonl(jsonl_file, samples)

        # moving eval file as otherwise evalplus does not want to recompute metrics if it's present..
        shutil.move(results_file, jsonl_file[:-6] + '_eval_results-saved.json')


def ifeval(cfg):
    """Grading instruction-following predictions and fusing the result back into each file.

    Runs instruction_following_eval.evaluation_main from /opt/benchmarks/google-research
    on every file in cfg.prediction_jsonl_files and stores the produced loose/strict
    results under the "loose_eval" / "strict_eval" keys of each sample (in place).

    Args:
        cfg: config with "prediction_jsonl_files".

    Raises:
        subprocess.CalledProcessError: if the evaluation command exits with non-zero code.
    """
    for jsonl_file in cfg.prediction_jsonl_files:
        # the command runs from a different working directory, so all paths have to be absolute
        input_file = Path(jsonl_file).absolute()
        parent_dir = input_file.parent
        # passing arguments as a list (no shell), so that paths with spaces
        # or shell metacharacters are handled correctly
        cmd = [
            'python',
            '-m',
            'instruction_following_eval.evaluation_main',
            f'--input_data={input_file}',
            f'--input_response_data={input_file}',
            f'--output_dir={parent_dir}',
        ]
        subprocess.run(cmd, cwd='/opt/benchmarks/google-research', check=True)
        # fusing eval metrics back into the generation file
        samples = _read_jsonl(jsonl_file)

        # results are matched to samples by position in the file
        for sample, eval_result in zip(samples, _read_jsonl(parent_dir / 'eval_results_loose.jsonl')):
            sample['loose_eval'] = eval_result

        for sample, eval_result in zip(samples, _read_jsonl(parent_dir / 'eval_results_strict.jsonl')):
            sample['strict_eval'] = eval_result

        _write_jsonl(jsonl_file, samples)

        # removing metric files to avoid reusing them
        (parent_dir / 'eval_results_loose.jsonl').unlink()
        (parent_dir / 'eval_results_strict.jsonl').unlink()
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import sys
from collections import Counter
from contextlib import ExitStack
from itertools import zip_longest
from pathlib import Path

# adding nemo_skills to python path to avoid requiring installation
# NOTE(review): parents[1] of this file is the package directory itself, not its parent,
# so this does not look sufficient to make "import nemo_skills" work - confirm the intended level
sys.path.append(str(Path(__file__).absolute().parents[1]))

LOG = logging.getLogger(__file__)


def _percent(count, total):
    """Returns count as a percentage of total (0.0 if there are no entries at all)."""
    if not total:
        return 0.0
    return count / total * 100.0


class MathEval:
    """Accumulates correct / wrong / no-answer statistics from already graded predictions."""

    def __init__(self):
        self.reset()

    def fill_up_missing(self):
        """Returns a placeholder entry to use in place of a missing prediction."""
        return {'predicted_answer': None, 'is_correct': False}

    def is_incomplete(self, elem):
        """Returns True if the entry lacks any of the keys required by update."""
        return 'is_correct' not in elem or 'predicted_answer' not in elem

    def update(self, predictions, aggregation_mode):
        """Updating the evaluation results with the current element.

        Args:
            predictions (list[dict]): aggregated predictions across all generations.
                The content of the file is benchmark specific.
            aggregation_mode (str): "best", "majority" or "first".

        Raises:
            ValueError: if aggregation_mode is not supported.
        """
        # this shouldn't do any heavy calculation, but just read the metric from existing json entry
        # all the heavy lifting should be done in the evaluation script
        self.total += 1
        if aggregation_mode == "best":
            self.total_correct += any(elem['is_correct'] for elem in predictions)
            if all(elem['predicted_answer'] is None for elem in predictions):
                self.total_no_answer += 1
        elif aggregation_mode == "majority":
            # TODO: currently majority does not take into account equivalent answers written in a different way
            valid_answers_and_results = [
                (elem['predicted_answer'], elem['is_correct'])
                for elem in predictions
                if elem['predicted_answer'] is not None
            ]
            if not valid_answers_and_results:
                self.total_no_answer += 1
            else:
                # most common (answer, is_correct) pair; only its correctness flag is needed
                _, majority_is_correct = Counter(valid_answers_and_results).most_common(1)[0][0]
                self.total_correct += majority_is_correct
        elif aggregation_mode == "first":
            self.total_correct += predictions[0]['is_correct']
            self.total_no_answer += predictions[0]['predicted_answer'] is None
        else:
            raise ValueError(f"Unsupported mode {aggregation_mode}")

    def get_metrics(self):
        """Returns the number of entries and the percentages of correct / wrong / missing answers."""
        return {
            "num_entries": self.total,
            "correct_answer": _percent(self.total_correct, self.total),
            "wrong_answer": _percent(self.total - self.total_correct - self.total_no_answer, self.total),
            "no_answer": _percent(self.total_no_answer, self.total),
        }

    def reset(self):
        """Zeroing out all accumulated counters."""
        self.total_correct = 0
        self.total_no_answer = 0
        self.total = 0


class CodeEval:
    """Accumulates base / plus test pass statistics from already graded predictions."""

    def __init__(self):
        self.reset()

    def fill_up_missing(self):
        """Returns a placeholder entry to use in place of a missing prediction."""
        return {'is_correct': False, 'is_correct-plus': False}

    def is_incomplete(self, elem):
        """Returns True if the entry lacks any of the keys required by update."""
        return 'is_correct' not in elem or 'is_correct-plus' not in elem

    def update(self, predictions, aggregation_mode):
        """Updating the evaluation results with the current element.

        Args:
            predictions (list[dict]): aggregated predictions across all generations.
                The content of the file is benchmark specific.
            aggregation_mode (str): "best" or "first".

        Raises:
            ValueError: if aggregation_mode is not supported.
        """
        # this shouldn't do any heavy calculation, but just read the metric from existing json entry
        # all the heavy lifting should be done in the evaluation script
        self.total += 1
        if aggregation_mode == "best":
            self.total_correct += any(elem['is_correct'] for elem in predictions)
            self.total_correct_plus += any(elem['is_correct-plus'] for elem in predictions)
        elif aggregation_mode == "first":
            self.total_correct += predictions[0]['is_correct']
            self.total_correct_plus += predictions[0]['is_correct-plus']
        else:
            raise ValueError(f"Unsupported mode {aggregation_mode}")

    def get_metrics(self):
        """Returns the number of entries and the percentages passing base / plus tests."""
        return {
            "num_entries": self.total,
            "passing_base_tests": _percent(self.total_correct, self.total),
            "passing_plus_tests": _percent(self.total_correct_plus, self.total),
        }

    def reset(self):
        """Zeroing out all accumulated counters."""
        self.total_correct = 0
        self.total_correct_plus = 0
        self.total = 0


class IFEval:
    """Accumulates loose / strict instruction-following statistics from already graded predictions."""

    def __init__(self):
        self.reset()

    def fill_up_missing(self):
        """Returns a placeholder entry to use in place of a missing prediction."""
        return {'loose_eval': {'follow_all_instructions': False}, 'strict_eval': {'follow_all_instructions': False}}

    def is_incomplete(self, elem):
        """Returns True if the entry lacks any of the keys required by update."""
        # without both top-level keys the entry cannot be used by update at all
        if 'loose_eval' not in elem or 'strict_eval' not in elem:
            return True
        return (
            'follow_all_instructions' not in elem['loose_eval'] or 'follow_all_instructions' not in elem['strict_eval']
        )

    def update(self, predictions, aggregation_mode):
        """Updating the evaluation results with the current element.

        Args:
            predictions (list[dict]): aggregated predictions across all generations.
                The content of the file is benchmark specific.
            aggregation_mode (str): "best" or "first".

        Raises:
            ValueError: if aggregation_mode is not supported.
        """
        # this shouldn't do any heavy calculation, but just read the metric from existing json entry
        # all the heavy lifting should be done in the evaluation script
        self.total += 1
        if aggregation_mode == "best":
            self.total_correct_loose += any(elem['loose_eval']['follow_all_instructions'] for elem in predictions)
            self.total_correct_strict += any(elem['strict_eval']['follow_all_instructions'] for elem in predictions)
        elif aggregation_mode == "first":
            self.total_correct_loose += predictions[0]['loose_eval']['follow_all_instructions']
            self.total_correct_strict += predictions[0]['strict_eval']['follow_all_instructions']
        else:
            raise ValueError(f"Unsupported mode {aggregation_mode}")

    def get_metrics(self):
        """Returns the number of entries and the strict / loose accuracy percentages."""
        return {
            "num_entries": self.total,
            "strict_accuracy": _percent(self.total_correct_strict, self.total),
            "loose_accuracy": _percent(self.total_correct_loose, self.total),
        }

    def reset(self):
        """Zeroing out all accumulated counters."""
        self.total_correct_loose = 0
        self.total_correct_strict = 0
        self.total = 0


def read_predictions(predictions, evaluator, allow_incomplete=False):
    """Parsing one line from each prediction file into a list of dicts.

    Args:
        predictions: iterable of json strings (one per file). Entries can be
            None/empty for files that have fewer lines than the others.
        evaluator: object providing is_incomplete(elem) and fill_up_missing().
        allow_incomplete (bool): if True, missing or incomplete entries are replaced
            with evaluator.fill_up_missing(), otherwise an error is raised.

    Returns:
        list[dict]: one entry per input prediction, in the same order.

    Raises:
        RuntimeError: if some entry is missing/incomplete and allow_incomplete is False.
    """
    data = []
    for prediction in predictions:
        # could have missing predictions, so only parsing what is there
        prediction_dict = json.loads(prediction) if prediction else None
        if not prediction_dict or evaluator.is_incomplete(prediction_dict):
            if not allow_incomplete:
                raise RuntimeError("Some data is missing!")
            data.append(evaluator.fill_up_missing())
            continue
        data.append(prediction_dict)

    return data


def compute_metrics(
    prediction_jsonl_files,
    evaluator,
    allow_incomplete=False,
    max_samples=-1,
    aggregation_mode='first',
):
    """Computing metrics over the prediction files with the given evaluator.

    All files are read in lockstep (line i of every file is passed to
    evaluator.update as a single list of predictions).

    Args:
        prediction_jsonl_files: files/patterns accepted by unroll_files.
        evaluator: metrics accumulator (e.g. MathEval, CodeEval or IFEval). It is reset first.
        allow_incomplete (bool): forwarded to read_predictions.
        max_samples (int): stop after that many lines. -1 (default) means no limit.
        aggregation_mode (str): forwarded to evaluator.update.

    Returns:
        dict: the result of evaluator.get_metrics().
    """
    # imported here to keep the metric classes usable without the rest of the package
    from nemo_skills.utils import unroll_files

    evaluator.reset()
    # ExitStack guarantees that all handles are closed even if parsing fails midway
    with ExitStack() as stack:
        file_handles = [
            stack.enter_context(open(file, "rt", encoding="utf-8")) for file in unroll_files(prediction_jsonl_files)
        ]
        for idx, predictions in enumerate(zip_longest(*file_handles)):
            if idx == max_samples:
                break
            data = read_predictions(predictions, evaluator, allow_incomplete)
            evaluator.update(data, aggregation_mode)

    return evaluator.get_metrics()
+ +# a collection of settings required to correctly running evaluations for different benchmarks +# in addition to what's in this file, there is also some prompt engineering that needs +# to happen in eval_map.py inside prompt folder for specific models + +from nemo_skills.evaluation.graders import code_eval, ifeval, math_eval +from nemo_skills.evaluation.metrics import CodeEval, IFEval, MathEval + +MATH_BENCHMARKS = [ + 'algebra222', + 'asdiv', + 'functional', + 'gsm-hard', + 'gsm-ic-2step', + 'gsm-ic-mstep', + 'gsm-plus', + 'gsm8k', + 'math', + 'mawps', + 'svamp', + 'tabmwp', + 'math-odyssey', + 'aime-2024', +] +CODE_BENCHMARKS = ['human-eval', 'mbpp'] + + +# ------------------------------- metrics settings ----------------------------- +EVALUATOR_MAP = { + "ifeval": IFEval, + "mmlu": MathEval, # TODO: update this +} + +for benchmark in MATH_BENCHMARKS: + EVALUATOR_MAP[benchmark] = MathEval + +for benchmark in CODE_BENCHMARKS: + EVALUATOR_MAP[benchmark] = CodeEval +# ------------------------------------------------------------------------------ + +# -------------------------------- eval settings ------------------------------- +EXTRA_EVAL_ARGS = { + # some benchmarks require specific extra arguments, which are defined here + 'human-eval': '++eval_type=code ++eval_config.dataset=humaneval', + 'mbpp': '++eval_type=code ++eval_config.dataset=mbpp', + 'ifeval': '++eval_type=ifeval', +} + +# TODO: better name? +GRADING_MAP = { + "math": math_eval, # that's default. TODO: should we do this per-benchmark? 
+ "code": code_eval, + "ifeval": ifeval, +} +# ------------------------------------------------------------------------------ + +# --------------------------------- gen settings ------------------------------- +EXTRA_GENERATION_ARGS = { + # some benchmarks require specific extra arguments, which are defined here + 'ifeval': '++generation_key=response', +} +# ------------------------------------------------------------------------------ diff --git a/nemo_skills/finetuning/data_preparation_utils/prepare_sft_data.yaml b/nemo_skills/finetuning/data_preparation_utils/prepare_sft_data.yaml index d9c9dc9ab..98dc6e5ba 100644 --- a/nemo_skills/finetuning/data_preparation_utils/prepare_sft_data.yaml +++ b/nemo_skills/finetuning/data_preparation_utils/prepare_sft_data.yaml @@ -16,6 +16,7 @@ num_output_samples: null prompt_type: openmathinstruct/sft chat_format: false # whether to use NeMo's chat format +generation_suffix: "" # suffix to add to the end of expected response (e.g. <|eot_id|>) filters: solution_key: generation # key to filter solutions by @@ -92,4 +93,5 @@ processors: output_manifest_file: ${output_path} prompt_type: ${prompt_type} chat_format: ${chat_format} + generation_suffix: ${generation_suffix} metadata: ${metadata} diff --git a/nemo_skills/finetuning/data_preparation_utils/preprocessing.py b/nemo_skills/finetuning/data_preparation_utils/preprocessing.py index f0c1b190c..c49e8ecae 100644 --- a/nemo_skills/finetuning/data_preparation_utils/preprocessing.py +++ b/nemo_skills/finetuning/data_preparation_utils/preprocessing.py @@ -229,11 +229,21 @@ def process(self): class WriteFinalSftManifest(BaseProcessor): - def __init__(self, prompt_type: str, chat_format: bool = False, metadata: Optional[Dict] = None, **kwargs): + def __init__( + self, + prompt_type: str, + chat_format: bool = False, + generation_suffix: str = "", + metadata: Optional[Dict] = None, + **kwargs, + ): super().__init__(**kwargs) self.prompt_type = prompt_type self.chat_format = chat_format 
self.metadata = metadata + self.generation_suffix = generation_suffix + if self.generation_suffix and self.chat_format: + raise ValueError("generation_suffix can only be used with chat_format=False") if not self.metadata: self.metadata = {} @@ -266,7 +276,7 @@ def process(self): elem['type'] = None else: elem["input"] = prompt.build_string(input_dict={"question": elem['question']}) - elem["output"] = elem.pop("generation") + elem["output"] = elem.pop("generation") + self.generation_suffix elem.update(self.metadata) fout.write(json.dumps(elem) + "\n") samples_count += 1 diff --git a/nemo_skills/finetuning/sft_config.yaml b/nemo_skills/finetuning/sft_config.yaml index bc5d9f523..2d464f711 100644 --- a/nemo_skills/finetuning/sft_config.yaml +++ b/nemo_skills/finetuning/sft_config.yaml @@ -27,15 +27,9 @@ trainer: val_check_interval: 1000 # TODO: currently float values are not supported save_interval: ${.val_check_interval} - limit_val_batches: 1.0 + limit_val_batches: 1 # since loss is not meaningful by default we are limiting the validation batches to 1 gradient_clip_val: null - inference_metrics: - code_generation_accuracy: - _target_: nemo_skills.finetuning.code_generation_accuracy.CodeGenerationAccuracyMetric - sandbox_cfg: - sandbox_type: local - # do not change these logger: False # logger provided by exp_manager enable_checkpointing: False @@ -56,9 +50,9 @@ exp_manager: resume_ignore_no_checkpoint: True create_checkpoint_callback: True checkpoint_callback_params: - monitor: val_code_generation_accuracy + monitor: val_loss save_top_k: 8 - mode: max + mode: min save_nemo_on_train_end: False filename: 'megatron_gpt_sft--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}-{epoch}' model_parallel_size: ${model.tensor_model_parallel_size} @@ -94,29 +88,10 @@ model: seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value use_flash_attention: null # if not None, will match the base 
model's value - hidden_dropout: 0.1 - attention_dropout: 0.1 - ffn_dropout: 0.1 - - inference: - sampling_params: - use_greedy: True - # most other parameters are not used if greedy is True - temperature: 0.7 - top_k: 0 - top_p: 0.95 - repetition_penalty: 1.0 - add_BOS: False - all_probs: False - compute_logprob: False - end_strings: ["<|endoftext|>", ""] - length_params: - min_length: 0 - max_length: 512 - strategy: - _target_: nemo_skills.inference.inference_strategy.CodeExecutionStrategy - sandbox_cfg: - sandbox_type: local + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + peft: peft_scheme: "none" # ["lora", "none"] restore_from_path: null @@ -152,7 +127,7 @@ model: # Example of how each dataset is formatted # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} file_path: ??? # Path to a JSONL file corresponding to the source data. Data format is identical to validation_ds. - global_batch_size: 128 + global_batch_size: 256 micro_batch_size: 1 shuffle: True memmap_workers: null diff --git a/nemo_skills/finetuning/sft_config_codegen.yaml b/nemo_skills/finetuning/sft_config_codegen.yaml new file mode 100644 index 000000000..bc5d9f523 --- /dev/null +++ b/nemo_skills/finetuning/sft_config_codegen.yaml @@ -0,0 +1,203 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +name: megatron_gpt_sft + +trainer: + num_nodes: 1 + devices: 1 + accelerator: gpu + precision: bf16 + + sft: + max_epochs: 2 + max_steps: -1 + + val_check_interval: 1000 # TODO: currently float values are not supported + save_interval: ${.val_check_interval} + + limit_val_batches: 1.0 + gradient_clip_val: null + + inference_metrics: + code_generation_accuracy: + _target_: nemo_skills.finetuning.code_generation_accuracy.CodeGenerationAccuracyMetric + sandbox_cfg: + sandbox_type: local + + # do not change these + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_time: null + max_epochs: ${.sft.max_epochs} + max_steps: ${.sft.max_steps} + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_code_generation_accuracy + save_top_k: 8 + mode: max + save_nemo_on_train_end: False + filename: 'megatron_gpt_sft--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}-{epoch}' + model_parallel_size: ${model.tensor_model_parallel_size} + save_best_model: False # need to keep this false otherwise it will create multiple last.ckpt files because restore reset the previous best model + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + restore_from_path: ??? # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
+ save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + megatron_amp_O2: True + encoder_seq_length: 4096 # the sequence length of the encoder model, it will be overwritten by loaded GPT model + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: full # 'selective' or 'full' + activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: 1 # not used with 'selective' + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. More details in megatron_gpt_config.yaml. 
+ answer_only_loss: True # not used right now + gradient_as_bucket_view: False + seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value + use_flash_attention: null # if not None, will match the base model's value + + hidden_dropout: 0.1 + attention_dropout: 0.1 + ffn_dropout: 0.1 + + inference: + sampling_params: + use_greedy: True + # most other parameters are not used if greedy is True + temperature: 0.7 + top_k: 0 + top_p: 0.95 + repetition_penalty: 1.0 + add_BOS: False + all_probs: False + compute_logprob: False + end_strings: ["<|endoftext|>", ""] + length_params: + min_length: 0 + max_length: 512 + strategy: + _target_: nemo_skills.inference.inference_strategy.CodeExecutionStrategy + sandbox_cfg: + sandbox_type: local + peft: + peft_scheme: "none" # ["lora", "none"] + restore_from_path: null + + lora_tuning: + target_modules: ['attention_qkv'] # currently only support attention_qkv + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + data: + chat: False # whether use chatbot data or not + chat_prompt_tokens: # special tokens for the chat prompts, a dictionary of {token_type: token}. note that some tokenizers may combine the characters at the junction between {end_of_turn}{turn_start}. e.g. '<im end><im start>', the '><' sometimes is merged to be a single token. 
This is not supported, try to avoid + system_turn_start: "" + turn_start: "" + label_start: "" + end_of_turn: "\x0A" # \x0A is '\n' + end_of_name: "\x0A" # \x0A is '\n' + + sample: False # create the index mapping files for the sample data, so max_steps * global_batch_size can be larger than the dataset size + num_workers: 1 + dataloader_type: single # only supports single + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_path: ??? # Path to a JSONL file corresponding to the source data. Data format is identical to validation_ds. + global_batch_size: 128 + micro_batch_size: 1 + shuffle: True + memmap_workers: null + max_seq_length: 4096 + min_seq_length: 1 + drop_last: True + label_key: 'output' + add_eos: True + add_sep: False + add_bos: False + truncation_field: "input" # Can be multiple keys separated with ',' Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input}{output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + hf_dataset: False # Whether to load the json file with the HuggingFace dataset. otherwise, will load the jsonl file with the JSONLMemMapDataset. + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + + validation_ds: + file_path: ??? # Path to a JSONL file corresponding to the source data. Data format is identical to train_ds. 
+ global_batch_size: ${model.data.train_ds.global_batch_size} + micro_batch_size: ${model.data.train_ds.micro_batch_size} + shuffle: False + memmap_workers: ${model.data.train_ds.memmap_workers} + max_seq_length: ${model.data.train_ds.max_seq_length} + min_seq_length: 1 + drop_last: False + label_key: ${model.data.train_ds.label_key} + add_eos: ${model.data.train_ds.add_eos} + add_sep: ${model.data.train_ds.add_sep} + add_bos: ${model.data.train_ds.add_bos} + truncation_field: ${model.data.train_ds.truncation_field} # Options: keys in prompt_template + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + hf_dataset: False # Whether to load the json file with the HuggingFace dataset. otherwise, will load the jsonl file with the JSONLMemMapDataset. + truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] + output_original_text: True # needed for the proper metrics support + + optim: + name: distributed_fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work. 
+ lr: 1e-6 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + # sched: + # name: CosineAnnealing + # warmup_steps: 0 + # constant_steps: 0 + # min_lr: 1e-6 \ No newline at end of file diff --git a/pipeline/compute_metrics.py b/pipeline/compute_metrics.py index ecbf9ffc9..84ffb5bce 100644 --- a/pipeline/compute_metrics.py +++ b/pipeline/compute_metrics.py @@ -16,287 +16,18 @@ import json import logging import sys -from collections import Counter, defaultdict -from itertools import zip_longest from pathlib import Path # adding nemo_skills to python path to avoid requiring installation sys.path.append(str(Path(__file__).absolute().parents[1])) -from nemo_skills.utils import setup_logging, unroll_files +from nemo_skills.evaluation.metrics import compute_metrics +from nemo_skills.evaluation.settings import EVALUATOR_MAP +from nemo_skills.utils import setup_logging LOG = logging.getLogger(__file__) -class MathEval: - def __init__(self): - self.reset() - - def fill_up_missing(self): - return {'predicted_answer': None, 'is_correct': False} - - def is_incomplete(self, elem): - return 'is_correct' not in elem or 'predicted_answer' not in elem - - def update(self, predictions, aggregation_mode): - """Updating the evaluation results with the current element. - - Args: - predictions (list[dict]): aggregated predictions across all generations. - The content of the file is benchmark specific. - aggregation_mode (str): "best", "majority", "first", etc. Might vary by benchmark. 
- """ - # this shouldn't do any heavy calculation, but just read the metric from existing json entry - # all the heavy lifting should be done in the evaluation script - self.total += 1 - if aggregation_mode == "best": - self.total_correct += any([elem['is_correct'] for elem in predictions]) - if all([elem['predicted_answer'] is None for elem in predictions]): - self.total_no_answer += 1 - elif aggregation_mode == "majority": - # TODO: currently majority does not take into account equivalent answers written in a different way - valid_answers_and_results = [ - (elem['predicted_answer'], elem['is_correct']) - for elem in predictions - if elem['predicted_answer'] is not None - ] - if len(valid_answers_and_results) == 0: - self.total_no_answer += 1 - else: - majority_result = Counter(valid_answers_and_results).most_common(1)[0][0] - self.total_correct += majority_result[1] - elif aggregation_mode == "first": - self.total_correct += predictions[0]['is_correct'] - self.total_no_answer += predictions[0]['predicted_answer'] is None - else: - raise ValueError(f"Unsupported mode {aggregation_mode}") - - def get_metrics(self): - return { - "num_entries": self.total, - "correct_answer": self.total_correct / self.total * 100.0, - "wrong_answer": (self.total - self.total_correct - self.total_no_answer) / self.total * 100.0, - "no_answer": self.total_no_answer / self.total * 100.0, - } - - def reset(self): - self.total_correct = 0 - self.total_no_answer = 0 - self.total = 0 - - -class CodeEval: - def __init__(self): - self.reset() - - def fill_up_missing(self): - return {'is_correct': False, 'is_correct-plus': False} - - def is_incomplete(self, elem): - return 'is_correct' not in elem or 'is_correct-plus' not in elem - - def update(self, predictions, aggregation_mode): - """Updating the evaluation results with the current element. - - Args: - predictions (list[dict]): aggregated predictions across all generations. - The content of the file is benchmark specific. 
- aggregation_mode (str): "best", "majority", "first", etc. Might vary by benchmark. - """ - # this shouldn't do any heavy calculation, but just read the metric from existing json entry - # all the heavy lifting should be done in the evaluation script - self.total += 1 - if aggregation_mode == "best": - self.total_correct += any([elem['is_correct'] for elem in predictions]) - self.total_correct_plus += any([elem['is_correct-plus'] for elem in predictions]) - elif aggregation_mode == "first": - self.total_correct += predictions[0]['is_correct'] - self.total_correct_plus += predictions[0]['is_correct-plus'] - else: - raise ValueError(f"Unsupported mode {aggregation_mode}") - - def get_metrics(self): - return { - "num_entries": self.total, - "passing_base_tests": self.total_correct / self.total * 100.0, - "passing_plus_tests": self.total_correct_plus / self.total * 100.0, - } - - def reset(self): - self.total_correct = 0 - self.total_correct_plus = 0 - self.total = 0 - - -class IFEval: - # loosely adapted from - # https://github.com/google-research/google-research/blob/master/instruction_following_eval/evaluation_main.py - - required_keys = ['follow_instruction_list', 'instruction_id_list'] - - def __init__(self): - self.reset() - - def fill_up_missing(self): - return { - 'loose_eval': {key: [] for key in self.required_keys}, - 'strict_eval': {key: [] for key in self.required_keys}, - } - - def is_incomplete(self, elem): - incomplete = 'loose_eval' not in elem or 'strict_eval' not in elem - if incomplete: - return True - - if any([key not in elem['loose_eval'] for key in self.required_keys]): - return True - - if any([key not in elem['strict_eval'] for key in self.required_keys]): - return True - - return False - - def _update_single_stat(self, stats_dict, elems): - """Will update using the pass@k strategy (just pass a single-element list to get greedy).""" - # has to be the same across all elements as they are solutions for the same question - instruction_id_list = 
elems[0]['instruction_id_list'] - # computing "pass@k" score - follow_instruction_list = elems[0]['follow_instruction_list'] - for elem in elems: - follow_instruction_list = [ - follow_instruction_list[i] or elem['follow_instruction_list'][i] - for i in range(len(follow_instruction_list)) - ] - - stats_dict['prompt']['total'] += 1 - if all(follow_instruction_list): - stats_dict['prompt']['correct'] += 1 - - stats_dict['instruction']['total'] += len(instruction_id_list) - stats_dict['instruction']['correct'] += sum(follow_instruction_list) - - for instruction_id, followed_or_not in zip(instruction_id_list, follow_instruction_list): - instruction_id = instruction_id.split(":")[0] - stats_dict['tier0']['total'][instruction_id] += 1 - if followed_or_not: - stats_dict['tier0']['correct'][instruction_id] += 1 - - def update(self, predictions, aggregation_mode): - """Updating the evaluation results with the current element. - - Args: - predictions (list[dict]): aggregated predictions across all generations. - The content of the file is benchmark specific. - aggregation_mode (str): "best", "majority", "first", etc. Might vary by benchmark. 
- """ - # this shouldn't do any heavy calculation, but just read the metric from existing json entry - # all the heavy lifting should be done in the evaluation script - if aggregation_mode == "best": - self._update_single_stat(self.strict_stats, [pred['strict_eval'] for pred in predictions]) - self._update_single_stat(self.loose_stats, [pred['loose_eval'] for pred in predictions]) - elif aggregation_mode == "first": - self._update_single_stat(self.strict_stats, [predictions[0]['strict_eval']]) - self._update_single_stat(self.loose_stats, [predictions[0]['loose_eval']]) - else: - raise ValueError(f"Unsupported mode {aggregation_mode}") - - def get_metrics(self): - prompt_total = self.strict_stats['prompt']['total'] - inst_total = self.strict_stats['instruction']['total'] - return { - "num_prompts": prompt_total, - "num_instructions": inst_total, - "prompt_strict_accuracy": self.strict_stats['prompt']['correct'] / prompt_total * 100.0, - "instruction_strict_accuracy": self.strict_stats['instruction']['correct'] / inst_total * 100.0, - "prompt_loose_accuracy": self.loose_stats['prompt']['correct'] / prompt_total * 100.0, - "instruction_loose_accuracy": self.loose_stats['instruction']['correct'] / inst_total * 100.0, - } - - def reset(self): - # the original code also has a deeper breakdown into tier1 scores, - # but that's probably too much for us to track at this stage - self.strict_stats = { - "prompt": {"total": 0, "correct": 0}, - "instruction": {"total": 0, "correct": 0}, - "tier0": {"total": defaultdict(int), "correct": defaultdict(int)}, - } - self.loose_stats = { - "prompt": {"total": 0, "correct": 0}, - "instruction": {"total": 0, "correct": 0}, - "tier0": {"total": defaultdict(int), "correct": defaultdict(int)}, - } - - -def compute_metrics( - prediction_jsonl_files, - evaluator, - allow_incomplete=False, - max_samples=-1, - aggregation_mode='first', -): - file_handles = [open(file, "rt", encoding="utf-8") for file in unroll_files(prediction_jsonl_files)] - 
- evaluator.reset() - for idx, lines in enumerate(zip_longest(*file_handles)): - if idx == max_samples: - break - data = [] - for line in lines: - if not line: # could have missing predictions - if not allow_incomplete: - raise RuntimeError("Some data is missing!") - data.append(evaluator.fill_up_missing()) - continue - line_dict = json.loads(line) - if not line_dict: - if not allow_incomplete: - raise RuntimeError("Some data is missing!") - data.append(evaluator.fill_up_missing()) - continue - if evaluator.is_incomplete(line_dict): - if not allow_incomplete: - raise RuntimeError("Some data is missing!") - data.append(evaluator.fill_up_missing()) - continue - data.append(line_dict) - - evaluator.update(data, aggregation_mode) - - for file_handle in file_handles: - file_handle.close() - - return evaluator.get_metrics() - - -# TODO: move all benchmark-specific things to some unified place -math_benchmarks = [ - 'algebra222', - 'asdiv', - 'functional', - 'gsm-hard', - 'gsm-ic-2step', - 'gsm-ic-mstep', - 'gsm-plus', - 'gsm8k', - 'math', - 'mawps', - 'svamp', - 'tabmwp', -] -code_benchmarks = ['human-eval', 'mbpp'] - -EVALUATOR_MAP = { - "ifeval": IFEval, - "mmlu": MathEval, # TODO: update this -} - -for benchmark in math_benchmarks: - EVALUATOR_MAP[benchmark] = MathEval - -for benchmark in code_benchmarks: - EVALUATOR_MAP[benchmark] = CodeEval - - if __name__ == '__main__': setup_logging(disable_hydra_logs=False) parser = argparse.ArgumentParser() diff --git a/pipeline/run_eval.py b/pipeline/run_eval.py index c403b994c..f7a8356e6 100644 --- a/pipeline/run_eval.py +++ b/pipeline/run_eval.py @@ -29,6 +29,7 @@ To see all supported agruments, nemo_skills package needs to be installed. Please note that it is not recommended to install Python packages on a slurm cluster login node. 
""" +from nemo_skills.evaluation.settings import EXTRA_EVAL_ARGS, EXTRA_GENERATION_ARGS from nemo_skills.utils import setup_logging SCRIPT_HELP = """ @@ -46,7 +47,7 @@ def get_greedy_cmd( benchmark, output_name='output-greedy.jsonl', extra_eval_args="", extra_arguments="", eval_map=None ): extra_eval_args = f"{EXTRA_EVAL_ARGS.get(benchmark, '')} {extra_eval_args}" - extra_arguments = f"{EXTRA_ARGS.get(benchmark, '')} {extra_arguments}" + extra_arguments = f"{EXTRA_GENERATION_ARGS.get(benchmark, '')} {extra_arguments}" if eval_map: extra_arguments = f"+prompt={eval_map.get(benchmark, eval_map['default'])} {extra_arguments}" return f"""echo "Evaluating benchmark {benchmark}" && \ @@ -92,18 +93,6 @@ def get_sampling_cmd(benchmark, random_seed, extra_eval_args="", extra_arguments MOUNTS = "{NEMO_SKILLS_CODE}:/code,{model_path}:/model,{output_dir}:/results" JOB_NAME = "eval-{model_name}" -EXTRA_EVAL_ARGS = { - # some benchmarks require specific extra arguments, which are defined here - 'human-eval': '++eval_type=code ++eval_config.dataset=humaneval', - 'mbpp': '++eval_type=code ++eval_config.dataset=mbpp', - 'ifeval': '++eval_type=ifeval', -} - -EXTRA_ARGS = { - # some benchmarks require specific extra arguments, which are defined here - 'ifeval': '++generation_key=response', -} - if __name__ == "__main__": setup_logging(disable_hydra_logs=False) diff --git a/pipeline/run_sft.py b/pipeline/run_sft.py index 3a1aa79b7..2fb6751e5 100644 --- a/pipeline/run_sft.py +++ b/pipeline/run_sft.py @@ -63,6 +63,7 @@ parser.add_argument( "--validation_dataset", default="gsm8k", + # TODO: how to disable it by default? help="Validation dataset to use. 
Make sure it exists inside datasets folder", ) parser.add_argument("--num_nodes", type=int, default=1) @@ -71,6 +72,7 @@ "--disable_wandb", action="store_true", help="Disable wandb logging and use tensorboard instead" ) parser.add_argument("--chat_format", action="store_true", help="Use chat format for SFT data") + parser.add_argument("--with_sandbox", action="store_true", help="If sandbox is required for code generation") parser.add_argument( "--partition", required=False, @@ -142,5 +144,5 @@ container=CLUSTER_CONFIG["containers"]["nemo"], mounts=MOUNTS.format(**format_dict), partition=args.partition, - with_sandbox=True, + with_sandbox=args.with_sandbox, ) diff --git a/pipeline/summarize_results.py b/pipeline/summarize_results.py index 31c78999f..7070968a6 100644 --- a/pipeline/summarize_results.py +++ b/pipeline/summarize_results.py @@ -17,7 +17,6 @@ import argparse import glob import json -import logging import subprocess import sys from collections import defaultdict @@ -29,6 +28,8 @@ from compute_metrics import EVALUATOR_MAP, compute_metrics +from nemo_skills.evaluation.metrics import MathEval + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -58,7 +59,7 @@ if not Path(benchmark_path).is_dir(): continue try: - evaluator = EVALUATOR_MAP[benchmark]() + evaluator = EVALUATOR_MAP.get(benchmark, MathEval)() if benchmark in ['human-eval', 'mbpp']: if Path(f'{benchmark_path}/output-greedy.jsonl').exists(): results[benchmark]['greedy'] = compute_metrics( @@ -99,22 +100,22 @@ lines_to_write = [] for benchmark, benchmark_results in results.items(): max_widths = {} - max_widths['evaluation mode'] = len('evaluation mode') + max_widths['evaluation_mode'] = len('evaluation_mode') for eval_mode, metrics in benchmark_results.items(): for metric_key, metric_value in metrics.items(): max_widths[metric_key] = max( max_widths.get(metric_key, len(metric_key)), len(f"{metric_value:.2f}" if isinstance(metric_value, float) else 
str(metric_value)), ) - max_widths['evaluation mode'] = max(max_widths['evaluation mode'], len(eval_mode)) + max_widths['evaluation_mode'] = max(max_widths['evaluation_mode'], len(eval_mode)) total_width = sum(max_widths.values()) + (len(max_widths) - 1) * 3 print(f' {benchmark} '.center(total_width, '-')) - headers = ['evaluation mode'] + list(list(benchmark_results.values())[0].keys()) + headers = ['evaluation_mode'] + list(list(benchmark_results.values())[0].keys()) print(' | '.join([f'{header:<{max_widths[header]}}' for header in headers])) for eval_mode, metrics in benchmark_results.items(): - values = [f'{eval_mode:<{max_widths["evaluation mode"]}}'] + values = [f'{eval_mode:<{max_widths["evaluation_mode"]}}'] for metric_key, metric_value in metrics.items(): if isinstance(metric_value, float): metric_value = f"{metric_value:.2f}" diff --git a/tests/check_help.py b/tests/check_help.py index 6cc022e2e..4494b6ee9 100644 --- a/tests/check_help.py +++ b/tests/check_help.py @@ -35,6 +35,7 @@ skills_script_list = [ 'nemo_skills/inference/generate_solutions.py', 'nemo_skills/evaluation/evaluate_results.py', + 'nemo_skills/evaluation/fill_majority_answer.py', ] diff --git a/tests/gpu-tests/test_finetuning.py b/tests/gpu-tests/test_finetuning.py index cdf1d678f..8da5e811d 100644 --- a/tests/gpu-tests/test_finetuning.py +++ b/tests/gpu-tests/test_finetuning.py @@ -17,7 +17,6 @@ # you'd also need 2+ GPUs to run this test # the metrics are assuming llama3-8b-base as the model and will fail for other models -import json import os import subprocess import sys @@ -25,8 +24,8 @@ import pytest -sys.path.append(str(Path(__file__).absolute().parents[2] / 'pipeline')) -from compute_metrics import MathEval, compute_metrics +sys.path.append(str(Path(__file__).absolute().parents[1])) +from nemo_skills.evaluation.metrics import MathEval, compute_metrics def test_sft_pipeline(): diff --git a/tests/gpu-tests/test_generation.py b/tests/gpu-tests/test_generation.py index 
52d1b6302..92b124060 100644 --- a/tests/gpu-tests/test_generation.py +++ b/tests/gpu-tests/test_generation.py @@ -25,8 +25,8 @@ import pytest -sys.path.append(str(Path(__file__).absolute().parents[2] / 'pipeline')) -from compute_metrics import MathEval, compute_metrics +sys.path.append(str(Path(__file__).absolute().parents[1])) +from nemo_skills.evaluation.metrics import MathEval, compute_metrics def test_trtllm_run_eval():