Merge pull request #64 from flairNLP/fine_tune_eval
Add experiments for instruction fine-tuning on NER
Showing 4 changed files with 386 additions and 0 deletions.
@@ -0,0 +1,48 @@
# Evaluation of LLMs' NLP Downstream Task Capabilities through Instruction Fine-Tuning

## Setup

Install the library:

```bash
# In the root of the project
python -m pip install -e .
```

Install the requirements for this experiment:

```bash
python -m pip install -r requirements.txt
```

## Fine-tune Model

```bash
torchrun --nproc_per_node=2 train.py \
    --model_name_or_path "../llama_hf" \
    --bf16 True \
    --output_dir dar_llama_big_noinp_clean \
    --num_train_epochs 3 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 16 \
    --evaluation_strategy "no" \
    --save_strategy "epoch" \
    --save_total_limit 3 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --fsdp "full_shard auto_wrap" \
    --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
    --tf32 True
```
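
For context, `train.py` converts each CoNLL-03 example into an instruction/output pair before tokenization. A simplified sketch of that conversion (not the full preprocessing pipeline):

```python
from datasets import load_dataset

# Simplified view of how train.py builds instruction/output pairs from CoNLL-03.
dataset = load_dataset("conll2003", split="train")
example = dataset[0]

instruction = " ".join(example["tokens"])                                 # the sentence as plain text
output = "[" + ", ".join(str(tag) for tag in example["ner_tags"]) + "]"   # integer tag ids as a list string
print(instruction)
print(output)
```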

## Evaluate LLM with the library

This generates NER tags for the CoNLL-03 evaluation split and evaluates them against the gold labels.

```bash
python evaluate.py --model_name_or_path "<HF_MODEL>"
```
@@ -0,0 +1,131 @@
import argparse
import os

from datasets import load_dataset, load_from_disk, Dataset
from haystack.nodes import PromptNode

from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer

from fabricator import DatasetGenerator
from fabricator.prompts import BasePrompt
from fabricator.samplers import random_sampler


ner_prompt = (
    "Given the following text. Annotate the example and choose your annotations from: {}"
)


def main(args):
    dataset = load_dataset(args.dataset_name, split=args.split)

    prompt = BasePrompt(
        task_description=ner_prompt,
        generate_data_for_column="ner_tags",
        fewshot_example_columns="tokens",
        label_options={0: "O", 1: "B-PER", 2: "I-PER", 3: "B-ORG", 4: "I-ORG", 5: "B-LOC", 6: "I-LOC"},
    )

    unlabeled_data = random_sampler(dataset, 30)

    if not args.use_cached:
        # Example instruction-tuned models to evaluate:
        # "tiiuae/falcon-7b-instruct"
        # "timdettmers/guanaco-33b-merged"
        prompt_node = PromptNode(
            model_name_or_path=args.model_name_or_path,
            api_key=os.environ.get("HF_API_KEY"),
        )

        generator = DatasetGenerator(prompt_node)
        generated_dataset: Dataset = generator.generate(
            prompt_template=prompt,
            unlabeled_dataset=unlabeled_data,
            max_prompt_calls=30,
            timeout_per_prompt=2,
        )

        # Cache the generated dataset so later runs can reuse it via --use_cached.
        generated_dataset.save_to_disk("generated_dataset")
    else:
        generated_dataset = load_from_disk("generated_dataset")

    evaluate(dataset, generated_dataset)

def post_process(generated_samples):
    """Some heuristics to clean up the generated samples."""

    def _post_process(generated_sample):
        cleaned_tags = []

        for tag in generated_sample["ner_tags"]:
            try:
                cleaned_tags.append(int(tag))
            except ValueError:
                if tag == "-":
                    cleaned_tags.append(0)
                elif tag.startswith("[") and tag.endswith("]") and len(tag) > 2:
                    # Strip surrounding brackets, e.g. "[3]" -> 3; fall back to "O" (0).
                    try:
                        cleaned_tags.append(int(tag[1:-1]))
                    except ValueError:
                        cleaned_tags.append(0)

        generated_sample["ner_tags"] = cleaned_tags

        return generated_sample

    return generated_samples.map(_post_process)


def build_gold_and_prediction_pairs(dataset, generated_dataset):
    """Builds a list of gold and predicted labels for each sample in the dataset."""
    golds = []
    predictions = []

    for generated_sample in generated_dataset:
        for gold_sample in dataset:
            # Match generated samples to gold samples by their token sequence.
            if generated_sample["tokens"] == gold_sample["tokens"]:
                golds.append(gold_sample["ner_tags"])
                predictions.append(generated_sample["ner_tags"])

    return golds, predictions


def calculate_metrics(golds, predictions):
    mlb = MultiLabelBinarizer()
    golds = mlb.fit_transform(golds)
    predictions = mlb.transform(predictions)
    acc = accuracy_score(golds, predictions)
    report = classification_report(golds, predictions)
    # Print the results
    print(f"Accuracy: {acc}")
    print(f"Classification Report:\n{report}")


def evaluate(dataset, generated_dataset):
    generated_dataset = post_process(generated_dataset)
    print(f"Using {len(generated_dataset)} samples from the generated dataset")
    golds, predictions = build_gold_and_prediction_pairs(dataset, generated_dataset)
    calculate_metrics(golds, predictions)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("--model_name_or_path", type=str, default="EleutherAI/pythia-70M-deduped")
    parser.add_argument("--dataset_name", type=str, default="conll2003")
    parser.add_argument("--split", type=str, default="validation")
    # Boolean flag; argparse's type=bool would treat any non-empty string as True.
    parser.add_argument("--use_cached", action="store_true")

    args = parser.parse_args()

    main(args)
@@ -0,0 +1,3 @@
torch
transformers
accelerate
@@ -0,0 +1,204 @@
import copy
import logging
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence

import torch
import transformers
from torch.utils.data import Dataset
from transformers import Trainer

IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"


# Prompt template used to wrap each NER instruction.
ner_prompt = (
    "Write a response to the question or task specified in the instruction. "
    "Note the input that provides further context.\n\n"
    "Instruction:\n{instruction}\n\nResponse:"
)

@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="facebook/opt-125m")


@dataclass
class DataArguments:
    data_path: str = field(default=None, metadata={"help": "Path to the training data."})


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(
        default=512,
        metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
    )


def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings = model.get_input_embeddings().weight.data
        output_embeddings = model.get_output_embeddings().weight.data

        # Initialize the new token embeddings with the mean of the existing embeddings.
        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings[-num_new_tokens:] = input_embeddings_avg
        output_embeddings[-num_new_tokens:] = output_embeddings_avg

def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize a list of strings."""
    tokenized_list = [
        tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        for text in strings
    ]
    input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
    input_ids_lens = labels_lens = [
        tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
    ]
    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
    )


def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Preprocess the data by tokenizing."""
    examples = [s + t for s, t in zip(sources, targets)]
    examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
    input_ids = examples_tokenized["input_ids"]
    labels = copy.deepcopy(input_ids)
    # Mask out the prompt (source) tokens so the loss is computed only on the response.
    for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
        label[:source_len] = IGNORE_INDEX
    return dict(input_ids=input_ids, labels=labels)

class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(self, list_data_dict, tokenizer: transformers.PreTrainedTokenizer):
        super(SupervisedDataset, self).__init__()
        logging.warning("Loading data...")

        sources = [
            ner_prompt.format_map(example) for example in list_data_dict
        ]
        targets = [f"{example['output']}{tokenizer.eos_token}" for example in list_data_dict]

        logging.warning("Tokenizing inputs... This may take some time...")
        data_dict = preprocess(sources, targets, tokenizer)

        self.input_ids = data_dict["input_ids"]
        self.labels = data_dict["labels"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return dict(input_ids=self.input_ids[i], labels=self.labels[i])

@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )

def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
    """Make dataset and collator for supervised fine-tuning."""
    from datasets import load_dataset

    # CoNLL-03
    dataset = load_dataset("conll2003", split="train")
    # Convert to dicts with keys "instruction" and "output" (integer tag ids serialized as a list string).
    dataset = [
        dict(instruction=" ".join(example["tokens"]), output=f'[{", ".join(str(tag) for tag in example["ner_tags"])}]')
        for example in dataset
    ]
    train_dataset = SupervisedDataset(dataset, tokenizer=tokenizer)
    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)

def train():
    parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
    )

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=False,
    )
    # Register any special tokens the tokenizer is missing and resize the embeddings accordingly.
    special_tokens_dict = dict()
    if tokenizer.pad_token is None or tokenizer.pad_token == "":
        special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
    if tokenizer.eos_token is None or tokenizer.eos_token == "":
        special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
    if tokenizer.bos_token is None or tokenizer.bos_token == "":
        special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
    if tokenizer.unk_token is None or tokenizer.unk_token == "":
        special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

    smart_tokenizer_and_embedding_resize(
        special_tokens_dict=special_tokens_dict,
        tokenizer=tokenizer,
        model=model,
    )

    data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
    trainer = Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)
    trainer.train()
    trainer.save_state()
    trainer.save_model(output_dir=training_args.output_dir)


if __name__ == "__main__":
    train()