diff --git a/execution/executors.py b/execution/executors.py
index e290b395..e89b2d8f 100644
--- a/execution/executors.py
+++ b/execution/executors.py
@@ -172,7 +172,7 @@ def process_output(self, output: str, tokenizer_eos_token: str) -> str:
             # for llama-based model
             return output.lstrip().split("\n\n")[0].split(";")[0].strip()
         else:
-            return output.lstrip().split(tokenizer_eos_token)[0].split("\n\n")[0].split(";")[0].strip()
+            return output.lstrip().split(tokenizer_eos_token)[0].split("\n\n")[0].split(";")[0].strip().replace("-- ", "")
 
     @overrides
     def exec_result_eq(self, program_dict_1: Dict[str, Any], program_dict_2: Dict[str, Any]) -> bool:
@@ -246,7 +246,7 @@ def gold_program_len(self, example: Dict[str, Any]) -> int:
 
     @overrides
     def process_output(self, output: str, tokenizer_eos_token: str) -> str:
-        return output.split("### Task End ###")[0] # NOTE: we can't strip because python code need to maintain the indentation
+        return output.lstrip().split("### Task End ###")[0] # NOTE: we can't strip because python code need to maintain the indentation
 
     @overrides
     def exec_result_eq(self, program_dict_1: Dict[str, Any], program_dict_2: Dict[str, Any]) -> bool:
@@ -339,6 +339,67 @@ def exec_result_eq(self, program_dict_1: Dict[str, Any], program_dict_2: Dict[st
         except Exception:
             return False
 
+    @classmethod
+    def real_exec_program(cls, program: str, example: Dict[str, Any]) -> Tuple[int, Union[str, List, Dict]]:
+        result = execute(program, output_locals=True)
+
+        if result["result"] == "passed":
+            if "answer" in result["locals"]:
+                try:
+                    executed_answer = float(result["locals"]["answer"]["str_value"])
+                except Exception:
+                    executed_answer = result["locals"]["answer"]["str_value"]
+                exec_match = executed_answer == example["answer"]
+
+                # the executed_answer needs to be a state dict
+                state_dict = dict()
+                for k, v in result["locals"].items():
+                    state_dict[k] = v["str_value"]
+                executed_answer = state_dict
+            else:
+                executed_answer = "ERROR: no answer variable"
+                exec_match = -1
+        else:
+            executed_answer = "ERROR: program failed to execute"
+            exec_match = -1
+
+        return exec_match, executed_answer
+
+class MathExecutorGPT(BaseExecutor):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    @overrides
+    def cache_key_func(self, program: str, example: Dict[str, Any]) -> str:
+        return example["question"] + " | " + program
+
+    @overrides
+    def program_len(self, program: str) -> int:
+        return python_program_len(program)
+
+    @overrides
+    def gold_program_len(self, example: Dict[str, Any]) -> int:
+        return 0
+
+    @overrides
+    def process_output(self, output: str, tokenizer_eos_token: str) -> str:
+        return output.lstrip().split(tokenizer_eos_token)[0].strip()
+
+    @overrides
+    def exec_result_eq(self, program_dict_1: Dict[str, Any], program_dict_2: Dict[str, Any]) -> bool:
+        if isinstance(program_dict_1['exec_result'], str) or isinstance(program_dict_2['exec_result'], str):
+            return False
+        else:
+            try:
+                str_match = program_dict_1['exec_result']["answer"] == program_dict_2['exec_result']["answer"]
+                if str_match:
+                    return True
+                else:
+                    numeric_match = abs(float(program_dict_1['exec_result']["answer"]) - float(program_dict_2['exec_result']["answer"])) < 1e-6
+                    return numeric_match
+            except Exception:
+                return False
+
     @classmethod
     def real_exec_program(cls, program: str, example: Dict[str, Any]) -> Tuple[int, Union[str, List, Dict]]:
         result = execute(program, output_locals=True)
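Note: the equality check added in MathExecutorGPT.exec_result_eq above first tries an exact string match on the "answer" entry and only then falls back to a numeric comparison with a 1e-6 absolute tolerance; string-valued exec_results (the "ERROR: ..." markers) never compare equal. A minimal standalone sketch of that behavior, using made-up result dicts rather than real executor output:

    def answers_equal(result_1, result_2):
        # String results are the "ERROR: ..." markers, so they never match.
        if isinstance(result_1, str) or isinstance(result_2, str):
            return False
        try:
            # Exact string match first, then a small absolute numeric tolerance.
            if result_1["answer"] == result_2["answer"]:
                return True
            return abs(float(result_1["answer"]) - float(result_2["answer"])) < 1e-6
        except Exception:
            return False

    print(answers_equal({"answer": "3.0"}, {"answer": "3.0000000001"}))          # True
    print(answers_equal({"answer": "3.0"}, "ERROR: program failed to execute"))  # False
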
diff --git a/finetuning/lightning_modules/models/openai_model.py b/finetuning/lightning_modules/models/openai_model.py
index 000322ae..60fe13a9 100644
--- a/finetuning/lightning_modules/models/openai_model.py
+++ b/finetuning/lightning_modules/models/openai_model.py
@@ -141,7 +141,7 @@ def __init__(self,
                  ) -> None:
 
         SUPPORTED_OPENAI_MODELS = ["code-davinci-002", "code-cushman-002", "code-cushman-001", "code-davinci-001",
-                                   "gpt-3.5-turbo", "text-davinci-003", "text-davinci-002","gpt-4"]
+                                   "gpt-3.5-turbo", "text-davinci-003", "text-davinci-002","gpt-4", "gpt-4-0314", "gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", "gpt-4-0613"]
         assert engine in SUPPORTED_OPENAI_MODELS, f"OpenAIModel only supports {SUPPORTED_OPENAI_MODELS}"
 
         self.engine = engine
diff --git a/finetuning/lightning_modules/models/seq2seq_model.py b/finetuning/lightning_modules/models/seq2seq_model.py
index 885724f4..1d42e76f 100644
--- a/finetuning/lightning_modules/models/seq2seq_model.py
+++ b/finetuning/lightning_modules/models/seq2seq_model.py
@@ -156,7 +156,7 @@ def generate_and_post_process(self,
             temp = temperature
 
         # https://github.com/THUDM/ChatGLM-6B/issues/31
-        if "santacoder" in self.transformer_model_name or "gpt-neox-20b" in self.transformer_model_name or "replit" in self.transformer_model_name:
+        if "santacoder" in self.transformer_model_name or "gpt-neox-20b" in self.transformer_model_name or "replit" in self.transformer_model_name or "Llama-2" in self.transformer_model_name or "mpt" in self.transformer_model_name:
             use_sample = False
 
         generation_results = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, do_sample=use_sample,
diff --git a/finetuning/lightning_modules/models/seq2seq_model_util.py b/finetuning/lightning_modules/models/seq2seq_model_util.py
index ad13c97b..34f3eadc 100755
--- a/finetuning/lightning_modules/models/seq2seq_model_util.py
+++ b/finetuning/lightning_modules/models/seq2seq_model_util.py
@@ -65,7 +65,7 @@ def get_model(model_name: str,
                                                      gradient_checkpointing=gradient_ckpt,
                                                      use_cache=not gradient_ckpt)
             if len(additional_special_tokens) > 0:
                 model.resize_token_embeddings(len(tokenizer))
-    elif model_name in ["EleutherAI/gpt-neox-20b", "EleutherAI/pythia-1.4b-deduped", "EleutherAI/pythia-6.9b-deduped", "EleutherAI/pythia-12b-deduped", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]:
+    elif model_name in ["EleutherAI/gpt-neox-20b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"] or "EleutherAI/pythia-" in model_name:
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         tokenizer.pad_token = tokenizer.eos_token
@@ -91,16 +91,17 @@ def get_model(model_name: str,
                                                          # gradient_checkpointing=gradient_ckpt,
                                                          use_cache=not gradient_ckpt,
                                                          **additional_init_args)
-    elif model_name.startswith("Salesforce/codegen-"):
+    elif model_name.startswith("Salesforce/codegen") or model_name.startswith("Salesforce/xgen"):
         tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                                  additional_special_tokens=additional_special_tokens)
+                                                  additional_special_tokens=additional_special_tokens,
+                                                  trust_remote_code=True)
         tokenizer.pad_token = tokenizer.eos_token
 
         if not tokenizer_only:
             model = AutoModelForCausalLM.from_pretrained(model_name,
                                                          pad_token_id=tokenizer.eos_token_id,
                                                          torch_dtype=torch.float16,
-                                                         # device_map="auto",
+                                                         trust_remote_code=True,
                                                          use_cache=True)
     elif model_name.startswith("bigscience/bloom-"):
         tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -145,17 +146,26 @@ def get_model(model_name: str,
 
         if not tokenizer_only:
             model = BartForSequenceClassification.from_pretrained(model_name, num_labels=2)
-    elif "llama" in model_name.lower() or "alpaca" in model_name.lower():
+    elif "llama" in model_name.lower() or "alpaca" in model_name.lower() or "vicuna" in model_name.lower() or "lemur" in model_name.lower():
         tokenizer = LlamaTokenizer.from_pretrained(model_name,
                                                    additional_special_tokens=additional_special_tokens)
         tokenizer.pad_token = tokenizer.eos_token
 
         if not tokenizer_only:
-            model = LlamaForCausalLM.from_pretrained(model_name,
-                                                     pad_token_id=tokenizer.eos_token_id,
-                                                     torch_dtype=torch.float16)
-            if len(additional_special_tokens) > 0:
-                model.resize_token_embeddings(len(tokenizer))
+            if "30" in model_name or "34" in model_name:
+                model = LlamaForCausalLM.from_pretrained(model_name,
+                                                         pad_token_id=tokenizer.eos_token_id,
+                                                         load_in_8bit=True)
+            elif "65" in model_name or "70" in model_name or "lemur" in model_name.lower():
+                model = LlamaForCausalLM.from_pretrained(model_name,
+                                                         pad_token_id=tokenizer.eos_token_id,
+                                                         load_in_4bit=True,
+                                                         device_map="auto")
+            else:
+                model = LlamaForCausalLM.from_pretrained(model_name,
+                                                         pad_token_id=tokenizer.eos_token_id)
+            # if len(additional_special_tokens) > 0:
+            #     model.resize_token_embeddings(len(tokenizer))
     elif model_name == "bigcode/santacoder":
         tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                   additional_special_tokens=additional_special_tokens)
@@ -169,7 +179,7 @@ def get_model(model_name: str,
                                                          )
             if len(additional_special_tokens) > 0:
                 model.resize_token_embeddings(len(tokenizer))
-    elif model_name in ["bigcode/starcoder", "HuggingFaceH4/starchat-alpha"]:
+    elif model_name in ["bigcode/starcoder", "HuggingFaceH4/starchat-alpha", "bigcode/starcoderplus", "WizardLM/WizardCoder-15B-V1.0"]:
         tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                   additional_special_tokens=additional_special_tokens)
         tokenizer.pad_token = tokenizer.eos_token
@@ -181,7 +191,7 @@ def get_model(model_name: str,
                                                          trust_remote_code=True)
             if len(additional_special_tokens) > 0:
                 model.resize_token_embeddings(len(tokenizer))
-    elif model_name == "replit/replit-code-v1-3b":
+    elif model_name == "replit/replit-code-v1-3b" or "mpt" in model_name:
         tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                   additional_special_tokens=additional_special_tokens,
                                                   trust_remote_code=True)
@@ -190,7 +200,7 @@ def get_model(model_name: str,
         if not tokenizer_only:
             model = AutoModelForCausalLM.from_pretrained(model_name,
                                                          pad_token_id=tokenizer.eos_token_id,
-                                                         torch_dtype=torch.float16,
+                                                         load_in_8bit=True if "30" in model_name else False,
                                                          trust_remote_code=True)
             if len(additional_special_tokens) > 0:
                 model.resize_token_embeddings(len(tokenizer))
@@ -201,11 +211,11 @@ def get_model(model_name: str,
         tokenizer.pad_token = tokenizer.eos_token
 
         # to accomandate the length of openai models and the prompt
-        if engine in ["code-davinci-002", "gpt-4"]:
+        if engine in ["code-davinci-002"] or engine.startswith("gpt-4"):
            model_length = 8001
        elif engine in ["code-cushman-001", "code-cushman-002"]:
            model_length = 1024
-        elif engine in ["text-davinci-002", "text-davinci-003", "gpt-3.5-turbo"]:
+        elif engine in ["text-davinci-002", "text-davinci-003"] or engine.startswith("gpt-3.5-turbo"):
            model_length = 4096
 
        tokenizer.model_max_length = model_length
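Note on the Llama branch in get_model above: the quantization level is keyed off substrings of the checkpoint name, with 30B/34B models loaded with load_in_8bit=True, 65B/70B and lemur models loaded with load_in_4bit=True and device_map="auto", and everything else loaded unquantized. A hypothetical helper that isolates just that decision (the function name and example checkpoints are illustrative, not part of the repo):

    def pick_quantization_kwargs(model_name: str) -> dict:
        # Mirrors the substring checks in the Llama branch of get_model().
        if "30" in model_name or "34" in model_name:
            # Mid-size checkpoints: 8-bit weights.
            return {"load_in_8bit": True}
        if "65" in model_name or "70" in model_name or "lemur" in model_name.lower():
            # Largest checkpoints: 4-bit weights, sharded across available GPUs.
            return {"load_in_4bit": True, "device_map": "auto"}
        # Smaller checkpoints keep the default (unquantized) weights.
        return {}

    print(pick_quantization_kwargs("huggyllama/llama-30b"))    # {'load_in_8bit': True}
    print(pick_quantization_kwargs("meta-llama/Llama-2-70b"))  # {'load_in_4bit': True, 'device_map': 'auto'}
    print(pick_quantization_kwargs("meta-llama/Llama-2-7b"))   # {}

Because the check is a plain substring match, any checkpoint path that happens to contain "30", "34", "65", or "70" would also trigger quantized loading.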