Update local changes made during the NeurIPS experiments #53

Open
wants to merge 1 commit into base: main

execution/executors.py (65 changes: 63 additions & 2 deletions)

@@ -172,7 +172,7 @@ def process_output(self, output: str, tokenizer_eos_token: str) -> str:
             # for llama-based model
             return output.lstrip().split("\n\n")[0].split(";")[0].strip()
         else:
-            return output.lstrip().split(tokenizer_eos_token)[0].split("\n\n")[0].split(";")[0].strip()
+            return output.lstrip().split(tokenizer_eos_token)[0].split("\n\n")[0].split(";")[0].strip().replace("-- ", "")
 
     @overrides
     def exec_result_eq(self, program_dict_1: Dict[str, Any], program_dict_2: Dict[str, Any]) -> bool:
@@ -246,7 +246,7 @@ def gold_program_len(self, example: Dict[str, Any]) -> int:
 
     @overrides
     def process_output(self, output: str, tokenizer_eos_token: str) -> str:
-        return output.split("### Task End ###")[0] # NOTE: we can't strip because python code need to maintain the indentation
+        return output.lstrip().split("### Task End ###")[0] # NOTE: we can't strip because python code need to maintain the indentation
 
     @overrides
     def exec_result_eq(self, program_dict_1: Dict[str, Any], program_dict_2: Dict[str, Any]) -> bool:
@@ -339,6 +339,67 @@ def exec_result_eq(self, program_dict_1: Dict[str, Any], program_dict_2: Dict[str, Any]) -> bool:
         except Exception:
             return False
 
+    @classmethod
+    def real_exec_program(cls, program: str, example: Dict[str, Any]) -> Tuple[int, Union[str, List, Dict]]:
+        result = execute(program, output_locals=True)
+
+        if result["result"] == "passed":
+            if "answer" in result["locals"]:
+                try:
+                    executed_answer = float(result["locals"]["answer"]["str_value"])
+                except Exception:
+                    executed_answer = result["locals"]["answer"]["str_value"]
+                exec_match = executed_answer == example["answer"]
+
+                # the executed_answer needs to be a state dict
+                state_dict = dict()
+                for k, v in result["locals"].items():
+                    state_dict[k] = v["str_value"]
+                executed_answer = state_dict
+            else:
+                executed_answer = "ERROR: no answer variable"
+                exec_match = -1
+        else:
+            executed_answer = "ERROR: program failed to execute"
+            exec_match = -1
+
+        return exec_match, executed_answer
+
+class MathExecutorGPT(BaseExecutor):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    @overrides
+    def cache_key_func(self, program: str, example: Dict[str, Any]) -> str:
+        return example["question"] + " | " + program
+
+    @overrides
+    def program_len(self, program: str) -> int:
+        return python_program_len(program)
+
+    @overrides
+    def gold_program_len(self, example: Dict[str, Any]) -> int:
+        return 0
+
+    @overrides
+    def process_output(self, output: str, tokenizer_eos_token: str) -> str:
+        return output.lstrip().split(tokenizer_eos_token)[0].strip()
+
+    @overrides
+    def exec_result_eq(self, program_dict_1: Dict[str, Any], program_dict_2: Dict[str, Any]) -> bool:
+        if isinstance(program_dict_1['exec_result'], str) or isinstance(program_dict_2['exec_result'], str):
+            return False
+        else:
+            try:
+                str_match = program_dict_1['exec_result']["answer"] == program_dict_2['exec_result']["answer"]
+                if str_match:
+                    return True
+                else:
+                    numeric_match = abs(float(program_dict_1['exec_result']["answer"]) - float(program_dict_2['exec_result']["answer"])) < 1e-6
+                    return numeric_match
+            except Exception:
+                return False
+
     @classmethod
     def real_exec_program(cls, program: str, example: Dict[str, Any]) -> Tuple[int, Union[str, List, Dict]]:
         result = execute(program, output_locals=True)
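Note: the new MathExecutorGPT.exec_result_eq treats two execution results as equal when their "answer" fields match as strings or agree numerically within 1e-6, and never matches failed runs (which are stored as error strings). A standalone sketch of that comparison with hypothetical exec_result values, for illustration only and not part of this diff:

# Illustration of the tolerance-based comparison added above (hypothetical inputs).
def answers_match(exec_result_1, exec_result_2) -> bool:
    # failed executions are stored as error strings and never match
    if isinstance(exec_result_1, str) or isinstance(exec_result_2, str):
        return False
    try:
        # exact string match first, then a numeric match within 1e-6
        if exec_result_1["answer"] == exec_result_2["answer"]:
            return True
        return abs(float(exec_result_1["answer"]) - float(exec_result_2["answer"])) < 1e-6
    except Exception:
        return False

print(answers_match({"answer": "3.0"}, {"answer": "3.0000000001"}))          # True
print(answers_match({"answer": "yes"}, {"answer": "no"}))                    # False
print(answers_match("ERROR: program failed to execute", {"answer": "3.0"}))  # False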
finetuning/lightning_modules/models/openai_model.py (2 changes: 1 addition & 1 deletion)

@@ -141,7 +141,7 @@ def __init__(self,
                  ) -> None:
         SUPPORTED_OPENAI_MODELS = ["code-davinci-002", "code-cushman-002",
                                    "code-cushman-001", "code-davinci-001",
-                                   "gpt-3.5-turbo", "text-davinci-003", "text-davinci-002","gpt-4"]
+                                   "gpt-3.5-turbo", "text-davinci-003", "text-davinci-002","gpt-4", "gpt-4-0314", "gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", "gpt-4-0613"]
         assert engine in SUPPORTED_OPENAI_MODELS, f"OpenAIModel only supports {SUPPORTED_OPENAI_MODELS}"
 
         self.engine = engine
finetuning/lightning_modules/models/seq2seq_model.py (2 changes: 1 addition & 1 deletion)

@@ -156,7 +156,7 @@ def generate_and_post_process(self,
             temp = temperature
 
         # https://github.com/THUDM/ChatGLM-6B/issues/31
-        if "santacoder" in self.transformer_model_name or "gpt-neox-20b" in self.transformer_model_name or "replit" in self.transformer_model_name:
+        if "santacoder" in self.transformer_model_name or "gpt-neox-20b" in self.transformer_model_name or "replit" in self.transformer_model_name or "Llama-2" in self.transformer_model_name or "mpt" in self.transformer_model_name:
            use_sample = False
 
        generation_results = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, do_sample=use_sample,
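Note: this extends an existing workaround (see the linked ChatGLM-6B issue) that disables sampling for certain model families, now covering Llama-2 and MPT checkpoints as well. A standalone sketch of the same guard; the helper name and example model strings below are illustrative, not from the repo:

# Illustrative guard mirroring the condition above: for these model families,
# do_sample is forced to False and generation is effectively greedy.
GREEDY_ONLY_MARKERS = ("santacoder", "gpt-neox-20b", "replit", "Llama-2", "mpt")

def should_force_greedy(transformer_model_name: str) -> bool:
    return any(marker in transformer_model_name for marker in GREEDY_ONLY_MARKERS)

print(should_force_greedy("meta-llama/Llama-2-13b-hf"))   # True
print(should_force_greedy("Salesforce/codegen-6B-mono"))  # False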
finetuning/lightning_modules/models/seq2seq_model_util.py (40 changes: 25 additions & 15 deletions)

@@ -65,7 +65,7 @@ def get_model(model_name: str,
                                                          gradient_checkpointing=gradient_ckpt, use_cache=not gradient_ckpt)
             if len(additional_special_tokens) > 0:
                 model.resize_token_embeddings(len(tokenizer))
-    elif model_name in ["EleutherAI/gpt-neox-20b", "EleutherAI/pythia-1.4b-deduped", "EleutherAI/pythia-6.9b-deduped", "EleutherAI/pythia-12b-deduped", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]:
+    elif model_name in ["EleutherAI/gpt-neox-20b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"] or "EleutherAI/pythia-" in model_name:
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         tokenizer.pad_token = tokenizer.eos_token
 
@@ -91,16 +91,17 @@ def get_model(model_name: str,
                                                          # gradient_checkpointing=gradient_ckpt,
                                                          use_cache=not gradient_ckpt,
                                                          **additional_init_args)
-    elif model_name.startswith("Salesforce/codegen-"):
+    elif model_name.startswith("Salesforce/codegen") or model_name.startswith("Salesforce/xgen"):
         tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                                  additional_special_tokens=additional_special_tokens)
+                                                  additional_special_tokens=additional_special_tokens,
+                                                  trust_remote_code=True)
         tokenizer.pad_token = tokenizer.eos_token
 
         if not tokenizer_only:
             model = AutoModelForCausalLM.from_pretrained(model_name,
                                                          pad_token_id=tokenizer.eos_token_id,
                                                          torch_dtype=torch.float16,
-                                                         # device_map="auto",
+                                                         trust_remote_code=True,
                                                          use_cache=True)
     elif model_name.startswith("bigscience/bloom-"):
         tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -145,17 +145,26 @@ def get_model(model_name: str,
 
         if not tokenizer_only:
             model = BartForSequenceClassification.from_pretrained(model_name, num_labels=2)
-    elif "llama" in model_name.lower() or "alpaca" in model_name.lower():
+    elif "llama" in model_name.lower() or "alpaca" in model_name.lower() or "vicuna" in model_name.lower() or "lemur" in model_name.lower():
         tokenizer = LlamaTokenizer.from_pretrained(model_name,
                                                    additional_special_tokens=additional_special_tokens)
         tokenizer.pad_token = tokenizer.eos_token
 
         if not tokenizer_only:
-            model = LlamaForCausalLM.from_pretrained(model_name,
-                                                     pad_token_id=tokenizer.eos_token_id,
-                                                     torch_dtype=torch.float16)
-            if len(additional_special_tokens) > 0:
-                model.resize_token_embeddings(len(tokenizer))
+            if "30" in model_name or "34" in model_name:
+                model = LlamaForCausalLM.from_pretrained(model_name,
+                                                         pad_token_id=tokenizer.eos_token_id,
+                                                         load_in_8bit=True)
+            elif "65" in model_name or "70" in model_name or "lemur" in model_name.lower():
+                model = LlamaForCausalLM.from_pretrained(model_name,
+                                                         pad_token_id=tokenizer.eos_token_id,
+                                                         load_in_4bit=True,
+                                                         device_map="auto")
+            else:
+                model = LlamaForCausalLM.from_pretrained(model_name,
+                                                         pad_token_id=tokenizer.eos_token_id)
+            # if len(additional_special_tokens) > 0:
+            #     model.resize_token_embeddings(len(tokenizer))
     elif model_name == "bigcode/santacoder":
         tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                   additional_special_tokens=additional_special_tokens)
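Note: the Llama branch above now picks a quantization mode from the checkpoint name, roughly by parameter count. A standalone sketch of that dispatch, assuming transformers with accelerate and bitsandbytes installed; the load_llama helper name is made up for illustration:

# Rough sketch of the size-based loading above; the helper name is illustrative.
from transformers import LlamaForCausalLM, LlamaTokenizer

def load_llama(model_name: str):
    tokenizer = LlamaTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    if "30" in model_name or "34" in model_name:
        # ~30B/34B checkpoints: 8-bit weights to reduce GPU memory
        model = LlamaForCausalLM.from_pretrained(model_name,
                                                 pad_token_id=tokenizer.eos_token_id,
                                                 load_in_8bit=True)
    elif "65" in model_name or "70" in model_name or "lemur" in model_name.lower():
        # ~65B/70B checkpoints: 4-bit weights, sharded automatically across devices
        model = LlamaForCausalLM.from_pretrained(model_name,
                                                 pad_token_id=tokenizer.eos_token_id,
                                                 load_in_4bit=True,
                                                 device_map="auto")
    else:
        # smaller checkpoints load without quantization
        model = LlamaForCausalLM.from_pretrained(model_name,
                                                 pad_token_id=tokenizer.eos_token_id)
    return tokenizer, model

One caveat worth noting: the substring checks on "30", "34", "65", and "70" also match any checkpoint name that happens to contain those digits, so they rely on the naming conventions of the checkpoints actually used.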
@@ -169,7 +179,7 @@ def get_model(model_name: str,
                                                          )
             if len(additional_special_tokens) > 0:
                 model.resize_token_embeddings(len(tokenizer))
-    elif model_name in ["bigcode/starcoder", "HuggingFaceH4/starchat-alpha"]:
+    elif model_name in ["bigcode/starcoder", "HuggingFaceH4/starchat-alpha", "bigcode/starcoderplus", "WizardLM/WizardCoder-15B-V1.0"]:
         tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                   additional_special_tokens=additional_special_tokens)
         tokenizer.pad_token = tokenizer.eos_token
@@ -181,7 +191,7 @@ def get_model(model_name: str,
                                                          trust_remote_code=True)
             if len(additional_special_tokens) > 0:
                 model.resize_token_embeddings(len(tokenizer))
-    elif model_name == "replit/replit-code-v1-3b":
+    elif model_name == "replit/replit-code-v1-3b" or "mpt" in model_name:
         tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                   additional_special_tokens=additional_special_tokens,
                                                   trust_remote_code=True)
@@ -190,7 +200,7 @@ def get_model(model_name: str,
         if not tokenizer_only:
             model = AutoModelForCausalLM.from_pretrained(model_name,
                                                          pad_token_id=tokenizer.eos_token_id,
-                                                         torch_dtype=torch.float16,
+                                                         load_in_8bit=True if "30" in model_name else False,
                                                          trust_remote_code=True)
             if len(additional_special_tokens) > 0:
                 model.resize_token_embeddings(len(tokenizer))
@@ -201,11 +211,11 @@ def get_model(model_name: str,
         tokenizer.pad_token = tokenizer.eos_token
 
         # to accomandate the length of openai models and the prompt
-        if engine in ["code-davinci-002", "gpt-4"]:
+        if engine in ["code-davinci-002"] or engine.startswith("gpt-4"):
             model_length = 8001
         elif engine in ["code-cushman-001", "code-cushman-002"]:
             model_length = 1024
-        elif engine in ["text-davinci-002", "text-davinci-003", "gpt-3.5-turbo"]:
+        elif engine in ["text-davinci-002", "text-davinci-003"] or engine.startswith("gpt-3.5-turbo"):
             model_length = 4096
 
         tokenizer.model_max_length = model_length
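Note: the last hunk widens the context-length lookup to cover dated engine snapshots via prefix matching. A standalone sketch of the mapping, for illustration only; in the repo the chosen value is also written to tokenizer.model_max_length:

# Illustrative mapping from engine name to the context length used for prompt budgeting.
def max_context_length(engine: str) -> int:
    if engine in ["code-davinci-002"] or engine.startswith("gpt-4"):
        return 8001
    elif engine in ["code-cushman-001", "code-cushman-002"]:
        return 1024
    elif engine in ["text-davinci-002", "text-davinci-003"] or engine.startswith("gpt-3.5-turbo"):
        return 4096
    raise ValueError(f"unsupported engine: {engine}")

print(max_context_length("gpt-4-0613"))          # 8001
print(max_context_length("gpt-3.5-turbo-0301"))  # 4096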