diff --git a/src/small_model_training/project_experiments/cola/cola_baseline_downstream.py b/src/small_model_training/project_experiments/cola/cola_baseline_downstream.py
index e69de29..2d7e3bb 100644
--- a/src/small_model_training/project_experiments/cola/cola_baseline_downstream.py
+++ b/src/small_model_training/project_experiments/cola/cola_baseline_downstream.py
@@ -0,0 +1,91 @@
+import numpy as np
+import datetime
+import os
+from transformers import AutoTokenizer, TrainingArguments
+from transformers import DataCollatorWithPadding
+from transformers import AutoModelForSequenceClassification, Trainer
+from datasets import load_dataset
+import evaluate
+
+generated_dataset = load_dataset("stolzenp/500-cola-sentences-baseline", split="train")
+# GLUE test labels are hidden (-1), so score on the labeled validation split instead
+test_split = load_dataset("glue", "cola", split="validation")
+
+model_name = "bert-base-uncased"
+
+label2id = {"correct": 1, "wrong": 0, "acceptable": 1, "unacceptable": 0}
+id2label = {1: "acceptable", 0: "unacceptable"}
+
+# setup preprocessing
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+
+def preprocess_text(batch):
+    preprocessed_tokens = tokenizer(batch["sentence"], truncation=True, padding=True)  # GLUE CoLA stores text in "sentence"
+    return preprocessed_tokens
+
+
+def preprocess_text_and_labels(batch):
+    preprocessed_tokens = tokenizer(batch["text"], truncation=True, padding=True)
+    preprocessed_tokens["label"] = [label2id[label] for label in batch["label"]]
+    return preprocessed_tokens
+
+
+# setup compute_metrics
+accuracy = evaluate.load("accuracy")
+
+
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    predictions = np.argmax(predictions, axis=1)
+    return accuracy.compute(predictions=predictions, references=labels)
+
+
+generated_dataset = generated_dataset.train_test_split(test_size=0.1)
+train_split = generated_dataset["train"]
+val_split = generated_dataset["test"]
+
+tokenized_train = train_split.map(preprocess_text_and_labels, batched=True)
+tokenized_val = val_split.map(preprocess_text_and_labels, batched=True)
+tokenized_test = test_split.map(preprocess_text, batched=True)
+
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+model = AutoModelForSequenceClassification.from_pretrained(
+    model_name, num_labels=2, id2label=id2label, label2id=label2id
+)
+
+# initialize training
+trainer = Trainer(
+    model=model,
+    args=TrainingArguments(
+        output_dir="baseline_downstream_model",
+        learning_rate=2e-5,
+        per_device_train_batch_size=16,
+        per_device_eval_batch_size=16,
+        num_train_epochs=2,
+        weight_decay=0.01,
+        evaluation_strategy="epoch",
+        save_strategy="epoch",
+        load_best_model_at_end=True,
+    ),
+    train_dataset=tokenized_train,
+    eval_dataset=tokenized_val,
+    tokenizer=tokenizer,
+    data_collator=data_collator,
+    compute_metrics=compute_metrics,
+)
+
+trainer.train()
+outputs = trainer.predict(tokenized_test)
+test_accuracy = outputs.metrics["test_accuracy"]
+
+results_file = "results.txt"
+experiments_directory = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+results_path = f"{experiments_directory}/{results_file}"
+
+results_timestamp = datetime.datetime.now()
+
+with open(results_path, "a") as file:
+    file.write(f"{results_timestamp} - cola_baseline\n")
+    file.write(f"accuracy: {test_accuracy}\n")
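Note on the split choice above: GLUE distributes CoLA test labels only as the placeholder value -1, so accuracy computed locally against the test split is meaningless. A minimal stand-alone check (a sketch, using only the public datasets API):

    from datasets import load_dataset

    # CoLA test labels are withheld; every entry is the placeholder -1
    test = load_dataset("glue", "cola", split="test")
    print(set(test["label"]))  # expected: {-1}
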
diff --git a/src/small_model_training/project_experiments/cola/cola_baseline_generation.py b/src/small_model_training/project_experiments/cola/cola_baseline_generation.py
index e69de29..d58ae5b 100644
--- a/src/small_model_training/project_experiments/cola/cola_baseline_generation.py
+++ b/src/small_model_training/project_experiments/cola/cola_baseline_generation.py
@@ -0,0 +1,28 @@
+from haystack.nodes import PromptNode
+from fabricator import DatasetGenerator
+from fabricator.prompts import BasePrompt
+
+label_options = ["correct", "wrong"]
+
+prompt = BasePrompt(
+    task_description="Generate a grammatically {} sentence similar to this:",
+    label_options=label_options,
+    target_formatting_template="",
+)
+
+prompt_node = PromptNode(
+    model_name_or_path="mistralai/Mistral-7B-Instruct-v0.1",
+    max_length=50,
+    model_kwargs={
+        "model_kwargs": {"do_sample": True, "temperature": 0.5, "top_p": 0.9}
+    },
+)
+
+generator = DatasetGenerator(prompt_node)
+generated_dataset = generator.generate(
+    prompt_template=prompt,
+    max_prompt_calls=500,
+    num_samples_to_generate=500,
+)
+
+generated_dataset.push_to_hub("500-cola-sentences-baseline")
diff --git a/src/small_model_training/project_experiments/cola/cola_gold_downstream.py b/src/small_model_training/project_experiments/cola/cola_gold_downstream.py
index e43e9f5..79808da 100644
--- a/src/small_model_training/project_experiments/cola/cola_gold_downstream.py
+++ b/src/small_model_training/project_experiments/cola/cola_gold_downstream.py
@@ -1,3 +1,82 @@
+import numpy as np
+import datetime
+import os
+from transformers import AutoTokenizer, TrainingArguments
+from transformers import DataCollatorWithPadding
+from transformers import AutoModelForSequenceClassification, Trainer
 from datasets import load_dataset
+import evaluate
 
-gold_dataset = load_dataset("glue", "cola", split="train")
+train_split = load_dataset("glue", "cola", split="train")
+val_split = load_dataset("glue", "cola", split="validation")
+# GLUE test labels are hidden (-1), so score on the labeled validation split instead
+test_split = load_dataset("glue", "cola", split="validation")
+
+model_name = "bert-base-uncased"
+
+label2id = {"correct": 1, "wrong": 0, "acceptable": 1, "unacceptable": 0}
+id2label = {1: "acceptable", 0: "unacceptable"}
+
+# setup preprocessing
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+
+def preprocess_text(batch):
+    preprocessed_tokens = tokenizer(batch["sentence"], truncation=True, padding=True)  # GLUE CoLA stores text in "sentence"
+    return preprocessed_tokens
+
+
+# setup compute_metrics
+accuracy = evaluate.load("accuracy")
+
+
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    predictions = np.argmax(predictions, axis=1)
+    return accuracy.compute(predictions=predictions, references=labels)
+
+
+tokenized_train = train_split.map(preprocess_text, batched=True)
+tokenized_val = val_split.map(preprocess_text, batched=True)
+tokenized_test = test_split.map(preprocess_text, batched=True)
+
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+model = AutoModelForSequenceClassification.from_pretrained(
+    model_name, num_labels=2, id2label=id2label, label2id=label2id
+)
+
+# initialize training
+trainer = Trainer(
+    model=model,
+    args=TrainingArguments(
+        output_dir="gold_downstream_model",
+        learning_rate=2e-5,
+        per_device_train_batch_size=16,
+        per_device_eval_batch_size=16,
+        num_train_epochs=2,
+        weight_decay=0.01,
+        evaluation_strategy="epoch",
+        save_strategy="epoch",
+        load_best_model_at_end=True,
+    ),
+    train_dataset=tokenized_train,
+    eval_dataset=tokenized_val,
+    tokenizer=tokenizer,
+    data_collator=data_collator,
+    compute_metrics=compute_metrics,
+)
+
+trainer.train()
+outputs = trainer.predict(tokenized_test)
+test_accuracy = outputs.metrics["test_accuracy"]
+
+results_file = "results.txt"
+experiments_directory = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+results_path = f"{experiments_directory}/{results_file}"
+
+results_timestamp = datetime.datetime.now()
+
+with open(results_path, "a") as file:
+    file.write(f"{results_timestamp} - cola_gold\n")
+    file.write(f"accuracy: {test_accuracy}\n")
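For readers unfamiliar with the map-based preprocessing above, this minimal sketch shows what the tokenizer call adds to a batch (the sentences are made up for illustration):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    batch = {"sentence": ["The book was read by John.", "Book the was read John."]}
    tokens = tokenizer(batch["sentence"], truncation=True, padding=True)
    print(tokens["input_ids"])       # padded token-id sequences, one per sentence
    print(tokens["attention_mask"])  # 1 for real tokens, 0 for padding
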
diff --git a/src/small_model_training/project_experiments/cola/cola_smt_downstream.py b/src/small_model_training/project_experiments/cola/cola_smt_downstream.py
index e69de29..98ef47f 100644
--- a/src/small_model_training/project_experiments/cola/cola_smt_downstream.py
+++ b/src/small_model_training/project_experiments/cola/cola_smt_downstream.py
@@ -0,0 +1,91 @@
+import numpy as np
+import datetime
+import os
+from transformers import AutoTokenizer, TrainingArguments
+from transformers import DataCollatorWithPadding
+from transformers import AutoModelForSequenceClassification, Trainer
+from datasets import load_dataset
+import evaluate
+
+generated_dataset = load_dataset("stolzenp/500-cola-sentences-smt", split="train")
+# GLUE test labels are hidden (-1), so score on the labeled validation split instead
+test_split = load_dataset("glue", "cola", split="validation")
+
+model_name = "bert-base-uncased"
+
+label2id = {"correct": 1, "wrong": 0, "acceptable": 1, "unacceptable": 0}
+id2label = {1: "acceptable", 0: "unacceptable"}
+
+# setup preprocessing
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+
+def preprocess_text(batch):
+    preprocessed_tokens = tokenizer(batch["sentence"], truncation=True, padding=True)  # GLUE CoLA stores text in "sentence"
+    return preprocessed_tokens
+
+
+def preprocess_text_and_labels(batch):
+    preprocessed_tokens = tokenizer(batch["text"], truncation=True, padding=True)
+    preprocessed_tokens["label"] = [label2id[label] for label in batch["label"]]
+    return preprocessed_tokens
+
+
+# setup compute_metrics
+accuracy = evaluate.load("accuracy")
+
+
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    predictions = np.argmax(predictions, axis=1)
+    return accuracy.compute(predictions=predictions, references=labels)
+
+
+generated_dataset = generated_dataset.train_test_split(test_size=0.1)
+train_split = generated_dataset["train"]
+val_split = generated_dataset["test"]
+
+tokenized_train = train_split.map(preprocess_text_and_labels, batched=True)
+tokenized_val = val_split.map(preprocess_text_and_labels, batched=True)
+tokenized_test = test_split.map(preprocess_text, batched=True)
+
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+model = AutoModelForSequenceClassification.from_pretrained(
+    model_name, num_labels=2, id2label=id2label, label2id=label2id
+)
+
+# initialize training
+trainer = Trainer(
+    model=model,
+    args=TrainingArguments(
+        output_dir="smt_downstream_model",
+        learning_rate=2e-5,
+        per_device_train_batch_size=16,
+        per_device_eval_batch_size=16,
+        num_train_epochs=2,
+        weight_decay=0.01,
+        evaluation_strategy="epoch",
+        save_strategy="epoch",
+        load_best_model_at_end=True,
+    ),
+    train_dataset=tokenized_train,
+    eval_dataset=tokenized_val,
+    tokenizer=tokenizer,
+    data_collator=data_collator,
+    compute_metrics=compute_metrics,
+)
+
+trainer.train()
+outputs = trainer.predict(tokenized_test)
+test_accuracy = outputs.metrics["test_accuracy"]
+
+results_file = "results.txt"
+experiments_directory = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+results_path = f"{experiments_directory}/{results_file}"
+
+results_timestamp = datetime.datetime.now()
+
+with open(results_path, "a") as file:
+    file.write(f"{results_timestamp} - cola_smt\n")
+    file.write(f"accuracy: {test_accuracy}\n")
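The baseline and SMT downstream scripts carve their validation set out of the generated data with train_test_split. A minimal sketch of that step on toy rows (the seed is an assumption; the scripts above do not fix one):

    from datasets import Dataset

    generated = Dataset.from_dict({
        "text": ["She sings well.", "Well the she sings."] * 50,
        "label": ["correct", "wrong"] * 50,
    })
    splits = generated.train_test_split(test_size=0.1, seed=42)  # seed assumed, for reproducibility
    print(len(splits["train"]), len(splits["test"]))  # 90 10
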
diff --git a/src/small_model_training/project_experiments/cola/cola_smt_generation.py b/src/small_model_training/project_experiments/cola/cola_smt_generation.py
index e69de29..e492262 100644
--- a/src/small_model_training/project_experiments/cola/cola_smt_generation.py
+++ b/src/small_model_training/project_experiments/cola/cola_smt_generation.py
@@ -0,0 +1,30 @@
+from haystack.nodes import PromptNode
+from fabricator import DatasetGenerator
+from fabricator.prompts import BasePrompt
+
+label_options = ["correct", "wrong"]
+
+prompt = BasePrompt(
+    task_description="Generate a grammatically {} sentence similar to this:",
+    label_options=label_options,
+    target_formatting_template="",
+)
+
+prompt_node = PromptNode(
+    model_name_or_path="mistralai/Mistral-7B-Instruct-v0.1",
+    max_length=50,
+    model_kwargs={
+        "model_kwargs": {"do_sample": True, "temperature": 0.5, "top_p": 0.9}
+    },
+)
+
+generator = DatasetGenerator(prompt_node)
+generated_dataset = generator.generate(
+    prompt_template=prompt,
+    max_prompt_calls=500,
+    num_samples_to_generate=500,
+    small_model_training="text_classification",
+    train_small_model_every_X_generations=50,
+)
+
+generated_dataset.push_to_hub("500-cola-sentences-smt")
diff --git a/src/small_model_training/project_experiments/imdb/imdb_baseline_downstream.py b/src/small_model_training/project_experiments/imdb/imdb_baseline_downstream.py
index 35e035e..391e204 100644
--- a/src/small_model_training/project_experiments/imdb/imdb_baseline_downstream.py
+++ b/src/small_model_training/project_experiments/imdb/imdb_baseline_downstream.py
@@ -88,5 +88,5 @@ def compute_metrics(eval_pred):
 results_timestamp = datetime.datetime.now()
 
 with open(results_path, "a") as file:
-    file.write(f"{results_timestamp} - imdb_baseline_downstream\n")
+    file.write(f"{results_timestamp} - imdb_baseline\n")
     file.write(f"accuracy: {test_accuracy}\n")
diff --git a/src/small_model_training/project_experiments/imdb/imdb_gold_downstream.py b/src/small_model_training/project_experiments/imdb/imdb_gold_downstream.py
index 88926cd..68a7961 100644
--- a/src/small_model_training/project_experiments/imdb/imdb_gold_downstream.py
+++ b/src/small_model_training/project_experiments/imdb/imdb_gold_downstream.py
@@ -79,5 +79,5 @@ def compute_metrics(eval_pred):
 results_timestamp = datetime.datetime.now()
 
 with open(results_path, "a") as file:
-    file.write(f"{results_timestamp} - imdb_gold_downstream\n")
+    file.write(f"{results_timestamp} - imdb_gold\n")
     file.write(f"accuracy: {test_accuracy}\n")
diff --git a/src/small_model_training/project_experiments/imdb/imdb_smt_downstream.py b/src/small_model_training/project_experiments/imdb/imdb_smt_downstream.py
index 2850702..f647776 100644
--- a/src/small_model_training/project_experiments/imdb/imdb_smt_downstream.py
+++ b/src/small_model_training/project_experiments/imdb/imdb_smt_downstream.py
@@ -86,5 +86,5 @@ def compute_metrics(eval_pred):
 results_timestamp = datetime.datetime.now()
 
 with open(results_path, "a") as file:
-    file.write(f"{results_timestamp} - imdb_smt_downstream\n")
+    file.write(f"{results_timestamp} - imdb_smt\n")
     file.write(f"accuracy: {test_accuracy}\n")
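All downstream scripts append two lines per run to a shared results.txt, "<timestamp> - <experiment>" followed by "accuracy: <value>", which the tag renames above keep consistent. A sketch of how such a file could be read back (a hypothetical helper, not part of the repo):

    # Parse results.txt into {experiment_tag: accuracy}; keeps the latest entry per tag
    results = {}
    with open("results.txt") as file:
        lines = [line.strip() for line in file if line.strip()]
    for header, metric in zip(lines[::2], lines[1::2]):
        tag = header.split(" - ")[1]                # e.g. "imdb_smt"
        results[tag] = float(metric.split(": ")[1])
    print(results)
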
diff --git a/src/small_model_training/project_experiments/iterative_dataset_synthesis_movies.py b/src/small_model_training/project_experiments/iterative_dataset_synthesis_movies.py
index ba72432..579c22a 100644
--- a/src/small_model_training/project_experiments/iterative_dataset_synthesis_movies.py
+++ b/src/small_model_training/project_experiments/iterative_dataset_synthesis_movies.py
@@ -2,19 +2,19 @@
 from fabricator import DatasetGenerator
 from fabricator.prompts import BasePrompt
 
-label_options = ["positive", "negative"]
+label_options = ["correct", "wrong"]
 
 prompt = BasePrompt(
-    task_description="Generate an excerpt from a {} movie review similar to these:",
+    task_description="Generate a grammatically {} sentence similar to this:",
     label_options=label_options,
     target_formatting_template="",
 )
 
 prompt_node = PromptNode(
     model_name_or_path="mistralai/Mistral-7B-Instruct-v0.1",
-    max_length=100,
+    max_length=50,
     model_kwargs={
-        "model_kwargs": {"do_sample": True, "temperature": 0.4, "top_p": 0.9}
+        "model_kwargs": {"do_sample": True, "temperature": 0.5, "top_p": 0.9}
     },
 )
 
diff --git a/src/small_model_training/project_experiments/sst2/sst2_baseline_downstream.py b/src/small_model_training/project_experiments/sst2/sst2_baseline_downstream.py
index 3cc36aa..00e5056 100644
--- a/src/small_model_training/project_experiments/sst2/sst2_baseline_downstream.py
+++ b/src/small_model_training/project_experiments/sst2/sst2_baseline_downstream.py
@@ -88,5 +88,5 @@ def compute_metrics(eval_pred):
 results_timestamp = datetime.datetime.now()
 
 with open(results_path, "a") as file:
-    file.write(f"{results_timestamp} - sst2_baseline_downstream\n")
+    file.write(f"{results_timestamp} - sst2_baseline\n")
     file.write(f"accuracy: {test_accuracy}\n")
diff --git a/src/small_model_training/project_experiments/sst2/sst2_gold_downstream.py b/src/small_model_training/project_experiments/sst2/sst2_gold_downstream.py
index b45a415..2571bd8 100644
--- a/src/small_model_training/project_experiments/sst2/sst2_gold_downstream.py
+++ b/src/small_model_training/project_experiments/sst2/sst2_gold_downstream.py
@@ -77,5 +77,5 @@ def compute_metrics(eval_pred):
 results_timestamp = datetime.datetime.now()
 
 with open(results_path, "a") as file:
-    file.write(f"{results_timestamp} - sst2_gold_downstream\n")
+    file.write(f"{results_timestamp} - sst2_gold\n")
     file.write(f"accuracy: {test_accuracy}\n")
diff --git a/src/small_model_training/project_experiments/sst2/sst2_smt_downstream.py b/src/small_model_training/project_experiments/sst2/sst2_smt_downstream.py
index a143dc9..80f8f25 100644
--- a/src/small_model_training/project_experiments/sst2/sst2_smt_downstream.py
+++ b/src/small_model_training/project_experiments/sst2/sst2_smt_downstream.py
@@ -86,5 +86,5 @@ def compute_metrics(eval_pred):
 results_timestamp = datetime.datetime.now()
 
 with open(results_path, "a") as file:
-    file.write(f"{results_timestamp} - sst2_smt_downstream\n")
+    file.write(f"{results_timestamp} - sst2_smt\n")
     file.write(f"accuracy: {test_accuracy}\n")
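The compute_metrics hook used throughout these scripts receives raw logits, not class ids. A tiny worked example of the argmax-then-accuracy step, with made-up numbers:

    import numpy as np
    import evaluate

    accuracy = evaluate.load("accuracy")
    logits = np.array([[0.2, 1.3], [2.0, -0.5], [0.1, 0.4]])  # hypothetical model outputs
    labels = np.array([1, 0, 0])
    predictions = np.argmax(logits, axis=1)  # -> [1, 0, 1]
    print(accuracy.compute(predictions=predictions, references=labels))  # {'accuracy': 0.666...}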