In-code documentation and linting update #310

Open · wants to merge 1 commit into base: main

Changes from all commits
125 changes: 111 additions & 14 deletions generate_instruction.py
@@ -24,8 +24,26 @@
import fire


def encode_prompt(prompt_instructions: list) -> str:
"""
Encode multiple prompt instructions into a single string.

This function reads the file "prompt.txt" and uses its content as the opening of the prompt.
It then iterates over the prompt instructions, each a dictionary with an instruction, an input,
and an output, and appends a numbered, formatted block for each to the prompt string.

Parameters
----------
prompt_instructions : list of dict
A list containing dictionaries with keys "instruction", "input", and "output", representing
the instructions, inputs, and expected outputs for the prompt.

Returns
-------
prompt : str
A single string encoding all the prompt instructions along with the content of "prompt.txt".
"""
prompt = open("./prompt.txt").read() + "\n"

for idx, task_dict in enumerate(prompt_instructions):
@@ -41,7 +59,23 @@ def encode_prompt(prompt_instructions):
return prompt

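For illustration only (not part of this diff), a minimal sketch of calling the documented function; the task values are made up, and ./prompt.txt must exist as described in the docstring:

tasks = [
    {"instruction": "Translate to French.", "input": "Hello", "output": "Bonjour"},
    {"instruction": "Name a prime number.", "input": "", "output": "7"},
]
encoded = encode_prompt(tasks)  # prompt.txt contents followed by numbered task blocks
print(encoded)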

def post_process_gpt3_response(num_prompt_instructions: int, response: dict) -> list:
"""
Post-process the response generated by GPT-3, extracting formatted instructions, inputs, and outputs.

Parameters
----------
num_prompt_instructions : int
The number of prompt instructions used in the GPT-3 response.

response : dict or None
The response generated by GPT-3, containing the text and finish reason.

Returns
-------
instructions : list of dict
A list of dictionaries representing formatted instructions, inputs, and outputs extracted from the GPT-3 response.
"""
if response is None:
return []
raw_instructions = f"{num_prompt_instructions+1}. Instruction:" + response["text"]
@@ -103,21 +137,73 @@ def post_process_gpt3_response(num_prompt_instructions, response):
return instructions

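For orientation, a hedged sketch of the response shape this function consumes, inferred from the docstring and the visible first line of the body (the actual parsing logic is elided in this diff):

response = {
    # "text" continues the numbered list begun by the prompt instructions;
    # "finish_reason" signals whether the completion was truncated.
    "text": " Write a haiku about autumn.\n5. Instruction: ...",
    "finish_reason": "stop",
}
new_tasks = post_process_gpt3_response(num_prompt_instructions=3, response=response)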

def find_word_in_string(w: str, s: str):
"""
Search for a word within a string, matching only at word boundaries and ignoring case.

Parameters
----------
w : str
The word to search for within the string.

s : str
The string in which to search for the word.

Returns
-------
match : re.Match or None
A match object if the word is found within the string, otherwise None.
"""
return re.compile(r"\b({0})\b".format(w), flags=re.IGNORECASE).search(s)

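Two quick checks of the behavior documented above (illustrative values):

assert find_word_in_string("cat", "The CAT sat.") is not None  # case-insensitive match
assert find_word_in_string("cat", "concatenate") is None       # \b enforces word boundaries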

def generate_instruction_following_data(
output_dir: str = "./",
seed_tasks_path: str = "./seed_tasks.jsonl",
num_instructions_to_generate: int = 100,
model_name: str = "text-davinci-003",
num_prompt_instructions: int = 3,
request_batch_size: int = 5,
temperature: float = 1.0,
top_p: float = 1.0,
num_cpus: int = 16,
):
"""
Generate instruction-following data from the provided seed tasks using a GPT-3 model.

Parameters
----------
output_dir : str, optional
The directory where the generated instructions will be saved. Default is "./".

seed_tasks_path : str, optional
The path to the file containing seed tasks in JSONL format. Default is "./seed_tasks.jsonl".

num_instructions_to_generate : int, optional
The number of instructions to generate. Default is 100.

model_name : str, optional
The name of the GPT-3 model to use. Default is "text-davinci-003".

num_prompt_instructions : int, optional
The number of prompt instructions to use for generating each instruction. Default is 3.

request_batch_size : int, optional
The batch size for making requests to the GPT-3 model. Default is 5.

temperature : float, optional
The temperature parameter for sampling from the model distribution. Default is 1.0.

top_p : float, optional
The cumulative probability for nucleus sampling. Default is 1.0.

num_cpus : int, optional
The number of CPUs to use for parallel processing. Default is 16.

Returns
-------
None
"""
seed_tasks = [json.loads(l) for l in open(seed_tasks_path, "r")]
seed_instruction_data = [
{"instruction": t["instruction"], "input": t["instances"][0]["input"], "output": t["instances"][0]["output"]}
@@ -209,7 +295,18 @@ def generate_instruction_following_data(
utils.jdump(machine_instruction_data, os.path.join(output_dir, "regen.json"))


def main(task: str, **kwargs):
"""
Main function for executing specific tasks.

Parameters
----------
task : str
The name of the task to execute; it must name a function defined in this module, since main dispatches via globals().

**kwargs : dict
Additional keyword arguments specific to the task.
"""
globals()[task](**kwargs)

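Because main dispatches through globals(), any top-level function here can be invoked by name. A hedged sketch of driving the generator directly; the small argument values are only for a quick smoke test, and the CLI form assumes the script ends with fire.Fire(main), as the fire import suggests:

# Direct call, roughly equivalent to something like:
#   python generate_instruction.py generate_instruction_following_data --num_instructions_to_generate=10
main(
    task="generate_instruction_following_data",
    num_instructions_to_generate=10,
    request_batch_size=1,
)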

142 changes: 135 additions & 7 deletions train.py
@@ -67,9 +67,25 @@ def smart_tokenizer_and_embedding_resize(
tokenizer: transformers.PreTrainedTokenizer,
model: transformers.PreTrainedModel,
):
"""Resize tokenizer and embedding.
"""
Resize tokenizer and embedding.

This is the unoptimized version that may make your embedding size not be divisible by 64.

Parameters
----------
special_tokens_dict : Dict
A dictionary containing special tokens to be added to the tokenizer.

tokenizer : transformers.PreTrainedTokenizer
The tokenizer instance.

Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
model : transformers.PreTrainedModel
The model instance.

Returns
-------
None
"""
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))
@@ -86,7 +102,22 @@
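For context, a hedged usage sketch of the resize helper above; the gpt2 checkpoint and the [PAD] token are illustrative assumptions, not part of this diff:

import transformers

model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")  # placeholder checkpoint
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")

# GPT-2 ships without a pad token, so adding one exercises the resize path.
smart_tokenizer_and_embedding_resize(
    special_tokens_dict={"pad_token": "[PAD]"},
    tokenizer=tokenizer,
    model=model,
)
assert model.get_input_embeddings().weight.shape[0] == len(tokenizer)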


def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
"""Tokenize a list of strings."""
"""
Tokenize a list of strings.

Parameters
----------
strings : Sequence[str]
A list of strings to tokenize.

tokenizer : transformers.PreTrainedTokenizer
The tokenizer instance.

Returns
-------
Dict
A dictionary containing the tokenized input ids and their corresponding lengths.
"""
tokenized_list = [
tokenizer(
text,
@@ -114,7 +145,25 @@ def preprocess(
targets: Sequence[str],
tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
"""Preprocess the data by tokenizing."""
"""
Preprocess the data by tokenizing.

Parameters
----------
sources : Sequence[str]
A sequence of source strings.

targets : Sequence[str]
A sequence of target strings.

tokenizer : transformers.PreTrainedTokenizer
The tokenizer instance.

Returns
-------
Dict
A dictionary containing the tokenized input ids and labels.
"""
examples = [s + t for s, t in zip(sources, targets)]
examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
input_ids = examples_tokenized["input_ids"]
@@ -128,6 +177,17 @@ class SupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""

def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer):
"""
Initialize the SupervisedDataset.

Parameters
----------
data_path : str
The path to the data file.

tokenizer : transformers.PreTrainedTokenizer
The tokenizer instance.
"""
super(SupervisedDataset, self).__init__()
logging.warning("Loading data...")
list_data_dict = utils.jload(data_path)
@@ -146,20 +206,60 @@ def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer):
self.input_ids = data_dict["input_ids"]
self.labels = data_dict["labels"]

def __len__(self) -> int:
"""
Return the length of the dataset.

Returns
-------
int
Length of the dataset.
"""
return len(self.input_ids)

def __getitem__(self, i) -> Dict[str, torch.Tensor]:
"""
Get an item from the dataset.

Parameters
----------
i : int
Index of the item to retrieve.

Returns
-------
Dict[str, torch.Tensor]
A dictionary containing the input_ids and labels tensors for the specified item.
"""
return dict(input_ids=self.input_ids[i], labels=self.labels[i])

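A hedged indexing sketch showing the per-item contract of __getitem__; the data path and tokenizer are placeholders, not part of this diff:

import transformers

tok = transformers.AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer
tok.pad_token = tok.eos_token  # assumption: base gpt2 defines no pad token
dataset = SupervisedDataset(data_path="./alpaca_data.json", tokenizer=tok)  # hypothetical path
item = dataset[0]
assert set(item) == {"input_ids", "labels"}
assert item["input_ids"].shape == item["labels"].shape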

@dataclass
class DataCollatorForSupervisedDataset(object):
"""Collate examples for supervised fine-tuning."""
"""
Collate examples for supervised fine-tuning.
"""

tokenizer: transformers.PreTrainedTokenizer

def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
"""
Collate examples.

Parameters
----------
instances : Sequence[Dict]
A sequence of examples.

Returns
-------
Dict[str, torch.Tensor]
A dictionary containing the input ids, labels, and attention mask.
"""
input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
input_ids = torch.nn.utils.rnn.pad_sequence(
input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
@@ -173,13 +273,41 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
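For intuition, a hedged sketch of what the collator produces on ragged examples; the tokenizer choice and tensor values are illustrative:

import torch
import transformers

tok = transformers.AutoTokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token  # assumption: base gpt2 defines no pad token
collator = DataCollatorForSupervisedDataset(tokenizer=tok)
batch = collator([
    {"input_ids": torch.tensor([1, 2, 3]), "labels": torch.tensor([1, 2, 3])},
    {"input_ids": torch.tensor([4, 5]), "labels": torch.tensor([4, 5])},
])
# Shorter sequences are right-padded to the longest; attention_mask flags real tokens.
print(batch["input_ids"].shape)  # torch.Size([2, 3])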


def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
"""Make dataset and collator for supervised fine-tuning."""
"""
Make dataset and collator for supervised fine-tuning.

Parameters
----------
tokenizer : transformers.PreTrainedTokenizer
The tokenizer instance.

data_args : Any
Additional data arguments.

Returns
-------
Dict
A dictionary containing the train dataset, evaluation dataset, and data collator.
"""
train_dataset = SupervisedDataset(tokenizer=tokenizer, data_path=data_args.data_path)
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)

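A hedged sketch of how the returned module plugs into transformers.Trainer; model, tokenizer, and the parsed argument objects are assumed to come from the surrounding script, as in train() below:

data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
trainer = transformers.Trainer(model=model, args=training_args, **data_module)
trainer.train()
trainer.save_state()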

def train():
"""
Train the model.

Parses model, data, and training arguments, initializes the model and tokenizer, preprocesses the data, and then trains the model.

Returns
-------
None
"""
parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
