In-code documentation and linting update #310

Open · wants to merge 1 commit into base: main

Changes from all commits
125 changes: 111 additions & 14 deletions generate_instruction.py
@@ -24,8 +24,26 @@
import fire


def encode_prompt(prompt_instructions: list) -> str:
"""
Encode multiple prompt instructions into a single string.

This function reads the file "prompt.txt" and uses its content as the opening of the prompt.
It then iterates over the prompt instructions, each a dictionary with an instruction, an input,
and an output, and appends a numbered, formatted block for each to the prompt string.

Parameters
----------
prompt_instructions : list of dict
A list containing dictionaries with keys "instruction", "input", and "output", representing
the instructions, inputs, and expected outputs for the prompt.

Returns
-------
prompt : str
A single string encoding all the prompt instructions along with the content of "prompt.txt".
"""
prompt = open("./prompt.txt").read() + "\n"

for idx, task_dict in enumerate(prompt_instructions):
@@ -41,7 +59,23 @@ def encode_prompt(prompt_instructions):
return prompt

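For illustration only (not part of this diff), a minimal sketch of calling the documented function; the task values are made up, and ./prompt.txt must exist as described in the docstring:

tasks = [
    {"instruction": "Translate to French.", "input": "Hello", "output": "Bonjour"},
    {"instruction": "Name a prime number.", "input": "", "output": "7"},
]
encoded = encode_prompt(tasks)  # prompt.txt contents followed by numbered task blocks
print(encoded)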

def post_process_gpt3_response(num_prompt_instructions: int, response: dict) -> list:
"""
Post-process the response generated by GPT-3, extracting formatted instructions, inputs, and outputs.

Parameters
----------
num_prompt_instructions : int
The number of prompt instructions used in the GPT-3 response.

response : dict or None
The response generated by GPT-3, containing the text and finish reason.

Returns
-------
instructions : list of dict
A list of dictionaries representing formatted instructions, inputs, and outputs extracted from the GPT-3 response.
"""
if response is None:
return []
raw_instructions = f"{num_prompt_instructions+1}. Instruction:" + response["text"]
@@ -103,21 +137,73 @@ def post_process_gpt3_response(num_prompt_instructions, response):
return instructions

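For orientation, a hedged sketch of the response shape this function consumes, inferred from the docstring and the visible first line of the body (the actual parsing logic is elided in this diff):

response = {
    # "text" continues the numbered list begun by the prompt instructions;
    # "finish_reason" signals whether the completion was truncated.
    "text": " Write a haiku about autumn.\n5. Instruction: ...",
    "finish_reason": "stop",
}
new_tasks = post_process_gpt3_response(num_prompt_instructions=3, response=response)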

def find_word_in_string(w: str, s: str):
"""
Search for a word within a string, matching only at word boundaries and ignoring case.

Parameters
----------
w : str
The word to search for within the string.

s : str
The string in which to search for the word.

Returns
-------
match : re.Match or None
A match object if the word is found within the string, otherwise None.
"""
return re.compile(r"\b({0})\b".format(w), flags=re.IGNORECASE).search(s)

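Two quick checks of the behavior documented above (illustrative values):

assert find_word_in_string("cat", "The CAT sat.") is not None  # case-insensitive match
assert find_word_in_string("cat", "concatenate") is None       # \b enforces word boundaries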

def generate_instruction_following_data(
output_dir: str = "./",
seed_tasks_path: str = "./seed_tasks.jsonl",
num_instructions_to_generate: int = 100,
model_name: str = "text-davinci-003",
num_prompt_instructions: int = 3,
request_batch_size: int = 5,
temperature: float = 1.0,
top_p: float = 1.0,
num_cpus: int = 16,
):
"""
Generate instruction-following data from the provided seed tasks using a GPT-3 model.

Parameters
----------
output_dir : str, optional
The directory where the generated instructions will be saved. Default is "./".

seed_tasks_path : str, optional
The path to the file containing seed tasks in JSONL format. Default is "./seed_tasks.jsonl".

num_instructions_to_generate : int, optional
The number of instructions to generate. Default is 100.

model_name : str, optional
The name of the GPT-3 model to use. Default is "text-davinci-003".

num_prompt_instructions : int, optional
The number of prompt instructions to use for generating each instruction. Default is 3.

request_batch_size : int, optional
The batch size for making requests to the GPT-3 model. Default is 5.

temperature : float, optional
The temperature parameter for sampling from the model distribution. Default is 1.0.

top_p : float, optional
The cumulative probability for nucleus sampling. Default is 1.0.

num_cpus : int, optional
The number of CPUs to use for parallel processing. Default is 16.

Returns
-------
None
"""
seed_tasks = [json.loads(l) for l in open(seed_tasks_path, "r")]
seed_instruction_data = [
{"instruction": t["instruction"], "input": t["instances"][0]["input"], "output": t["instances"][0]["output"]}
@@ -209,7 +295,18 @@ def generate_instruction_following_data(
utils.jdump(machine_instruction_data, os.path.join(output_dir, "regen.json"))


def main(task: str, **kwargs):
"""
Main function for executing specific tasks.

Parameters
----------
task : str
The name of the task to execute; it must name a function defined in this module, since main dispatches via globals().

**kwargs : dict
Additional keyword arguments specific to the task.
"""
globals()[task](**kwargs)

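Because main dispatches through globals(), any top-level function here can be invoked by name. A hedged sketch of driving the generator directly; the small argument values are only for a quick smoke test, and the CLI form assumes the script ends with fire.Fire(main), as the fire import suggests:

# Direct call, roughly equivalent to something like:
#   python generate_instruction.py generate_instruction_following_data --num_instructions_to_generate=10
main(
    task="generate_instruction_following_data",
    num_instructions_to_generate=10,
    request_batch_size=1,
)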

142 changes: 135 additions & 7 deletions train.py
@@ -67,9 +67,25 @@ def smart_tokenizer_and_embedding_resize(
tokenizer: transformers.PreTrainedTokenizer,
model: transformers.PreTrainedModel,
):
"""Resize tokenizer and embedding.
"""
Resize tokenizer and embedding.

This is the unoptimized version that may make your embedding size not be divisible by 64.

Parameters
----------
special_tokens_dict : Dict
A dictionary containing special tokens to be added to the tokenizer.

tokenizer : transformers.PreTrainedTokenizer
The tokenizer instance.

Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
model : transformers.PreTrainedModel
The model instance.

Returns
-------
None
"""
num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))
@@ -86,7 +102,22 @@
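For context, a hedged usage sketch of the resize helper above; the gpt2 checkpoint and the [PAD] token are illustrative assumptions, not part of this diff:

import transformers

model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")  # placeholder checkpoint
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")

# GPT-2 ships without a pad token, so adding one exercises the resize path.
smart_tokenizer_and_embedding_resize(
    special_tokens_dict={"pad_token": "[PAD]"},
    tokenizer=tokenizer,
    model=model,
)
assert model.get_input_embeddings().weight.shape[0] == len(tokenizer)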


def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
"""Tokenize a list of strings."""
"""
Tokenize a list of strings.

Parameters
----------
strings : Sequence[str]
A list of strings to tokenize.

tokenizer : transformers.PreTrainedTokenizer
The tokenizer instance.

Returns
-------
Dict
A dictionary containing the tokenized input ids and their corresponding lengths.
"""
tokenized_list = [
tokenizer(
text,
@@ -114,7 +145,25 @@ def preprocess(
targets: Sequence[str],
tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
"""Preprocess the data by tokenizing."""
"""
Preprocess the data by tokenizing.

Parameters
----------
sources : Sequence[str]
A sequence of source strings.

targets : Sequence[str]
A sequence of target strings.

tokenizer : transformers.PreTrainedTokenizer
The tokenizer instance.

Returns
-------
Dict
A dictionary containing the tokenized input ids and labels.
"""
examples = [s + t for s, t in zip(sources, targets)]
examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
input_ids = examples_tokenized["input_ids"]
@@ -128,6 +177,17 @@ class SupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""

def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer):
"""
Initialize the SupervisedDataset.

Parameters
----------
data_path : str
The path to the data file.

tokenizer : transformers.PreTrainedTokenizer
The tokenizer instance.
"""
super(SupervisedDataset, self).__init__()
logging.warning("Loading data...")
list_data_dict = utils.jload(data_path)
@@ -146,20 +206,60 @@ def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer):
self.input_ids = data_dict["input_ids"]
self.labels = data_dict["labels"]

def __len__(self) -> int:
"""
Return the length of the dataset.

Returns
-------
int
Length of the dataset.
"""
return len(self.input_ids)

def __getitem__(self, i) -> Dict[str, torch.Tensor]:
"""
Get an item from the dataset.

Parameters
----------
i : int
Index of the item to retrieve.

Returns
-------
Dict[str, torch.Tensor]
A dictionary containing the input_ids and labels tensors for the specified item.
"""
return dict(input_ids=self.input_ids[i], labels=self.labels[i])

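A hedged indexing sketch showing the per-item contract of __getitem__; the data path and tokenizer are placeholders, not part of this diff:

import transformers

tok = transformers.AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer
tok.pad_token = tok.eos_token  # assumption: base gpt2 defines no pad token
dataset = SupervisedDataset(data_path="./alpaca_data.json", tokenizer=tok)  # hypothetical path
item = dataset[0]
assert set(item) == {"input_ids", "labels"}
assert item["input_ids"].shape == item["labels"].shape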

@dataclass
class DataCollatorForSupervisedDataset(object):
"""Collate examples for supervised fine-tuning."""
"""
Collate examples for supervised fine-tuning.
"""

tokenizer: transformers.PreTrainedTokenizer

def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
"""
Collate examples.

Parameters
----------
instances : Sequence[Dict]
A sequence of examples.

Returns
-------
Dict[str, torch.Tensor]
A dictionary containing the input ids, labels, and attention mask.
"""
input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
input_ids = torch.nn.utils.rnn.pad_sequence(
input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
@@ -173,13 +273,41 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
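For intuition, a hedged sketch of what the collator produces on ragged examples; the tokenizer choice and tensor values are illustrative:

import torch
import transformers

tok = transformers.AutoTokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token  # assumption: base gpt2 defines no pad token
collator = DataCollatorForSupervisedDataset(tokenizer=tok)
batch = collator([
    {"input_ids": torch.tensor([1, 2, 3]), "labels": torch.tensor([1, 2, 3])},
    {"input_ids": torch.tensor([4, 5]), "labels": torch.tensor([4, 5])},
])
# Shorter sequences are right-padded to the longest; attention_mask flags real tokens.
print(batch["input_ids"].shape)  # torch.Size([2, 3])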


def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
"""Make dataset and collator for supervised fine-tuning."""
"""
Make dataset and collator for supervised fine-tuning.

Parameters
----------
tokenizer : transformers.PreTrainedTokenizer
The tokenizer instance.

data_args : Any
Additional data arguments.

Returns
-------
Dict
A dictionary containing the train dataset, evaluation dataset, and data collator.
"""
train_dataset = SupervisedDataset(tokenizer=tokenizer, data_path=data_args.data_path)
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)

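A hedged sketch of how the returned module plugs into transformers.Trainer; model, tokenizer, and the parsed argument objects are assumed to come from the surrounding script, as in train() below:

data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
trainer = transformers.Trainer(model=model, args=training_args, **data_module)
trainer.train()
trainer.save_state()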

def train():
"""
Train the model.

Parses model, data, and training arguments, initializes the model and tokenizer, preprocesses the data, and then trains the model.

Returns
-------
None
"""
parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
