Merge pull request #59 from flairNLP/whoisjones/doc-tutorial-update
Added experiments for submission
HallerPatrick authored Aug 3, 2023
2 parents ce12e73 + 12672fc commit dd4739d
Showing 15 changed files with 1,125 additions and 5 deletions.
72 changes: 72 additions & 0 deletions paper_experiments/conll_annotate_dataset.py
@@ -0,0 +1,72 @@
import os

from datasets import load_dataset
from haystack.nodes import PromptNode

from ai_dataset_generator import DatasetGenerator, BasePrompt
from ai_dataset_generator.dataset_transformations.token_classification import convert_token_labels_to_spans

# Expanded CoNLL-2003 BIO tag mapping with human-readable entity names.
expanded_label_mapping = {
    0: "O",
    1: "B-person",
    2: "I-person",
    3: "B-organization",
    4: "I-organization",
    5: "B-location",
    6: "I-location",
    7: "B-miscellaneous",
    8: "I-miscellaneous",
}


def run():
    # Fewshot examples come from the gold-annotated CoNLL-2003 training split.
    fewshot_dataset = load_dataset("conll2003", split="train")
    fewshot_dataset, label_options = convert_token_labels_to_spans(
        fewshot_dataset,
        "tokens",
        "ner_tags",
        expanded_label_mapping=expanded_label_mapping,
    )

    # The validation split is re-annotated by the LLM.
    annotation_dataset = load_dataset("conll2003", split="validation")
    annotation_dataset, label_options = convert_token_labels_to_spans(
        annotation_dataset,
        "tokens",
        "ner_tags",
        expanded_label_mapping=expanded_label_mapping,
    )

    prompt = BasePrompt(
        task_description="Extract the following named entities from the text: {}. "
        "Your output format must be exactly the same as in the fewshot examples.",
        label_options=label_options,
        generate_data_for_column="ner_tags",
        fewshot_example_columns="tokens",
    )

    prompt_node = PromptNode(
        model_name_or_path="gpt-3.5-turbo",
        api_key=os.environ.get("OPENAI_API_KEY"),
        max_length=500,
    )

    generator = DatasetGenerator(prompt_node)
    generated_dataset = generator.generate(
        prompt_template=prompt,
        fewshot_dataset=fewshot_dataset,
        fewshot_examples_per_class=3,
        unlabeled_dataset=annotation_dataset,
        max_prompt_calls=len(annotation_dataset),
    )

    generated_dataset.push_to_hub("conll-validation-annotated", private=True)


if __name__ == "__main__":
    run()
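
Once the script has run, a quick sanity check is to pull the annotated corpus back from the Hub. This is a minimal sketch; the repo id "your-user/conll-validation-annotated" is an assumption derived from the push_to_hub call above and would carry your own account prefix.

from datasets import load_dataset

# Hypothetical repo id; push_to_hub above stores it under your Hub account.
annotated = load_dataset("your-user/conll-validation-annotated", split="train")
print(annotated[0]["tokens"])    # original token sequence
print(annotated[0]["ner_tags"])  # LLM-generated annotations
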
38 changes: 38 additions & 0 deletions paper_experiments/conll_gpt_train_model.py
@@ -0,0 +1,38 @@
from argparse import ArgumentParser

from datasets import load_dataset
from seqeval.metrics import accuracy_score, f1_score

from ai_dataset_generator import convert_spans_to_token_labels


def run(args):
    id2label = {
        0: "O",
        1: "B-person",
        2: "I-person",
        3: "B-organization",
        4: "I-organization",
        5: "B-location",
        6: "I-location",
        7: "B-miscellaneous",
        8: "I-miscellaneous",
    }
    dataset = load_dataset(args.corpus, split="train")
    dataset = convert_spans_to_token_labels(dataset, "tokens", "ner_tags", id2label=id2label)
    original = load_dataset("conll2003", split="validation")

    # Compare LLM annotations against gold labels; skip examples whose
    # tokenization no longer lines up with the original.
    y_pred = []
    y_true = []
    for generated_example, original_example in zip(dataset, original):
        if len(generated_example["tokens"]) == len(original_example["tokens"]):
            y_pred.append([id2label[tag] for tag in generated_example["ner_tags"]])
            y_true.append([id2label[tag] for tag in original_example["ner_tags"]])

    print(len(y_pred) / len(dataset))  # fraction of examples that could be evaluated
    print(accuracy_score(y_true, y_pred))
    print(f1_score(y_true, y_pred))


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--corpus", type=str)
    arguments = parser.parse_args()
    run(arguments)
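
To make the metric semantics above concrete, here is a self-contained toy example (not from the experiments): seqeval scores lists of label sequences, with accuracy_score counting token-level matches while f1_score requires exact span matches.

from seqeval.metrics import accuracy_score, f1_score

y_true = [["B-person", "I-person", "O"]]
y_pred = [["B-person", "O", "O"]]
print(accuracy_score(y_true, y_pred))  # 2 of 3 tokens match -> ~0.667
print(f1_score(y_true, y_pred))       # the person span is truncated -> 0.0
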
52 changes: 52 additions & 0 deletions paper_experiments/mrpc_annotate_dataset.py
@@ -0,0 +1,52 @@
import os

from datasets import load_dataset, concatenate_datasets
from haystack.nodes import PromptNode

from ai_dataset_generator import DatasetGenerator, BasePrompt
from ai_dataset_generator.dataset_transformations.text_classification import convert_label_ids_to_texts


def run():
    # The MRPC training split is the annotation pool; label ids are
    # converted into their text names.
    annotation_dataset, label_options = convert_label_ids_to_texts(
        load_dataset("glue", "mrpc", split="train"),
        "label",
        return_label_options=True,
    )

    # Build a balanced fewshot pool: six validation examples per label.
    fewshot_datasets = []
    for label in range(2):
        filtered_ds = load_dataset("glue", "mrpc", split="validation").filter(
            lambda x: x["label"] == label
        )
        fewshot_datasets.append(filtered_ds.select(range(6)))
    fewshot_dataset = concatenate_datasets(fewshot_datasets).shuffle(seed=42)

    fewshot_dataset = convert_label_ids_to_texts(fewshot_dataset, "label")

    prompt = BasePrompt(
        task_description="Given two sentences, determine by means of the fewshot examples "
        "whether these sentences are: {}.",
        label_options=label_options,
        generate_data_for_column="label",
        fewshot_example_columns=["sentence1", "sentence2"],
    )

    prompt_node = PromptNode(
        model_name_or_path="gpt-3.5-turbo",
        api_key=os.environ.get("OPENAI_API_KEY"),
        max_length=100,
    )

    generator = DatasetGenerator(prompt_node)
    generated_dataset, original_dataset = generator.generate(
        prompt_template=prompt,
        fewshot_dataset=fewshot_dataset,
        fewshot_examples_per_class=2,
        fewshot_label_sampling_strategy="stratified",
        unlabeled_dataset=annotation_dataset,
        max_prompt_calls=len(annotation_dataset),
        return_unlabeled_dataset=True,
    )

    generated_dataset.push_to_hub("glue_mrpc_annotated_12_fewshot_examples_2_per_prompt_stratified", private=True)
    original_dataset.push_to_hub("glue_mrpc_original", private=True)


if __name__ == "__main__":
    run()
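
The label texts that convert_label_ids_to_texts presumably surfaces as label_options come from the dataset's ClassLabel feature; for GLUE MRPC these are the two paraphrase classes, which can be verified directly:

from datasets import load_dataset

mrpc = load_dataset("glue", "mrpc", split="validation")
print(mrpc.features["label"].names)  # ['not_equivalent', 'equivalent']
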
143 changes: 143 additions & 0 deletions paper_experiments/mrpc_train_model.py
@@ -0,0 +1,143 @@
import argparse
import shutil

import evaluate
import numpy as np
from datasets import load_dataset, ClassLabel
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)


def run(args):
    # Iterate over all corpora passed on the command line.
    for corpus_name in args.corpora:
        # Iterate over all training-set sizes; -1 means take all examples (at most 10k).
        for size in [-1, 50, 500, 1000]:
            # Average results for this corpus and size over 5 seeds.
            result_avg = []
            for seed in [41, 42, 43, 44, 45]:
                # Load the dataset
                dataset = load_dataset(corpus_name, split="train").shuffle(seed=seed)
                test_split = load_dataset("glue", "mrpc", split="test")

                # Preprocess the LLM-annotated dataset: unify labels (lowercased,
                # no surrounding whitespace) and cast to the correct ClassLabel feature.
                if "annotated" in corpus_name:
                    original_labels = test_split.features["label"].names

                    def clean_labels(examples):
                        examples["label"] = examples["label"].strip().lower()
                        return examples

                    dataset = dataset.map(clean_labels)

                    dst_feat = ClassLabel(names=original_labels)
                    dataset = dataset.map(
                        lambda batch: {"label": dst_feat.str2int(batch)},
                        input_columns="label",
                        batched=True,
                    )
                    new_features = dataset.features.copy()
                    new_features["label"] = dst_feat
                    dataset = dataset.cast(new_features)

                # Compose the final training dataset + gold-labeled test split.
                if size > 0:
                    dataset = dataset.select(range(size))
                dataset = dataset.train_test_split(test_size=0.1)
                dataset["validation"] = dataset["test"]
                dataset["test"] = test_split
                num_labels = dataset["train"].features["label"].num_classes

                # Load the BERT tokenizer.
                model_name = "bert-base-uncased"
                tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

                # Tokenize both sentences of each pair.
                def preprocess_function(examples):
                    return tokenizer(
                        examples["sentence1"],
                        examples["sentence2"],
                        padding=True,
                        truncation=True,
                        return_tensors="pt",
                    )

                tokenized_dataset = dataset.map(preprocess_function, batched=True)
                data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
                accuracy = evaluate.load("accuracy")

                def compute_metrics(eval_pred):
                    predictions, labels = eval_pred
                    predictions = np.argmax(predictions, axis=1)
                    return accuracy.compute(predictions=predictions, references=labels)

                id2label = dict(enumerate(dataset["train"].features["label"].names))
                label2id = {v: k for k, v in id2label.items()}

                # Create the model and move it to CUDA.
                model = AutoModelForSequenceClassification.from_pretrained(
                    model_name,
                    num_labels=num_labels,
                    id2label=id2label,
                    label2id=label2id,
                ).to("cuda")

                # Scale the number of training epochs inversely with dataset size.
                if size < 0:
                    num_train_epochs = 5
                elif size == 1000:
                    num_train_epochs = 10
                else:
                    num_train_epochs = 20

                # Temporary path for storing model checkpoints.
                tmp_path = f"tmp/{corpus_name.replace('/', '-')}-{size}-samples"

                # Training arguments
                training_args = TrainingArguments(
                    output_dir=tmp_path,
                    learning_rate=2e-5,
                    per_device_train_batch_size=16,
                    per_device_eval_batch_size=16,
                    num_train_epochs=num_train_epochs,
                    weight_decay=0.01,
                    save_total_limit=1,
                    evaluation_strategy="epoch",
                    push_to_hub=False,
                )

                trainer = Trainer(
                    model=model,
                    args=training_args,
                    train_dataset=tokenized_dataset["train"],
                    eval_dataset=tokenized_dataset["validation"],
                    tokenizer=tokenizer,
                    data_collator=data_collator,
                    compute_metrics=compute_metrics,
                )

                trainer.train()

                results = trainer.predict(tokenized_dataset["test"])
                result_avg.append(results.metrics["test_accuracy"] * 100)

                # Remove the tmp path since we iterate over seeds, corpora and sizes.
                shutil.rmtree(tmp_path)

            # For logging, report size -1 as 'all'.
            log_size = str(size) if size > 0 else "all"

            # Write mean and standard deviation over seeds to a log file.
            log_corpus_name = corpus_name.replace("whoisjones/", "")
            file = f"{log_corpus_name}-{log_size}-samples"
            with open(f"results/{file}.log", "w") as f:
                f.write(f"Accuracy: {np.mean(result_avg)}\n")
                f.write(f"Standard deviation: {np.std(result_avg)}\n")


if __name__ == "__main__":
    # Run like: python mrpc_train_model.py --corpora hfaccount/generated-corpus hfaccount/gold-corpus
    parser = argparse.ArgumentParser()
    parser.add_argument("--corpora", nargs="+")  # a list of generated and gold-label corpora
    arguments = parser.parse_args()
    run(arguments)
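
Each run writes one log file per corpus/size pair into results/. A small helper like the following (hypothetical, assuming only the directory layout produced above) collects them into a single overview:

from pathlib import Path

# Hypothetical aggregation over the results/ directory written above.
for log_file in sorted(Path("results").glob("*-samples.log")):
    summary = log_file.read_text().strip().replace("\n", ", ")
    print(f"{log_file.stem}: {summary}")
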
54 changes: 54 additions & 0 deletions paper_experiments/snli_annotate_dataset.py
@@ -0,0 +1,54 @@
import os

from datasets import load_dataset, concatenate_datasets
from haystack.nodes import PromptNode

from ai_dataset_generator import DatasetGenerator, BasePrompt
from ai_dataset_generator.dataset_transformations.text_classification import convert_label_ids_to_texts


def run():
    # Annotation pool: 10k labeled SNLI training examples (label == -1 marks
    # examples without a gold label and is filtered out).
    annotation_dataset, label_options = convert_label_ids_to_texts(
        load_dataset("snli", split="train")
        .filter(lambda x: x["label"] in [0, 1, 2])
        .shuffle(seed=42)
        .select(range(10000)),
        "label",
        return_label_options=True,
    )

    # Build a balanced fewshot pool: six validation examples per label.
    fewshot_datasets = []
    for label in range(3):
        filtered_ds = load_dataset("snli", split="validation").filter(lambda x: x["label"] == label)
        fewshot_datasets.append(filtered_ds.select(range(6)))
    fewshot_dataset = concatenate_datasets(fewshot_datasets).shuffle(seed=42)

    fewshot_dataset = convert_label_ids_to_texts(fewshot_dataset, "label")

    prompt = BasePrompt(
        task_description="Given two sentences, determine by means of the fewshot examples "
        "whether these sentence pairs are: {}.",
        label_options=label_options,
        generate_data_for_column="label",
        fewshot_example_columns=["premise", "hypothesis"],
    )

    prompt_node = PromptNode(
        model_name_or_path="gpt-3.5-turbo",
        api_key=os.environ.get("OPENAI_API_KEY"),
        max_length=100,
    )

    generator = DatasetGenerator(prompt_node)
    generated_dataset, original_dataset = generator.generate(
        prompt_template=prompt,
        fewshot_dataset=fewshot_dataset,
        fewshot_examples_per_class=2,
        fewshot_label_sampling_strategy="stratified",
        unlabeled_dataset=annotation_dataset,
        max_prompt_calls=len(annotation_dataset),
        return_unlabeled_dataset=True,
    )

    generated_dataset.push_to_hub("snli_annotated_18_fewshot_examples_2_per_prompt_stratified", private=True)
    original_dataset.push_to_hub("snli_original", private=True)


if __name__ == "__main__":
    run()
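
The filter on label in [0, 1, 2] matters because SNLI marks examples without a gold annotation with label == -1; a quick check of the raw split shows both the class names and the unlabeled remainder:

from datasets import load_dataset

snli = load_dataset("snli", split="train")
print(snli.features["label"].names)  # ['entailment', 'neutral', 'contradiction']
print(snli.filter(lambda x: x["label"] == -1).num_rows)  # examples without a gold label
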