From ba38c3427b0d2c7033c0e1ebaf142497bfb15499 Mon Sep 17 00:00:00 2001 From: "H. Jin" Date: Mon, 15 Apr 2024 23:53:16 -0500 Subject: [PATCH] clean repo --- demo.py | 43 -- demo_sft_hpo_deephyper.py | 131 ---- demo_sft_hpo_optuna.py | 121 --- demo_v1.py | 47 -- run_demo_sft_hpo_deephyper.sh | 16 - run_demo_sft_hpo_optuna.sh | 16 - using_sft.ipynb | 774 ------------------- using_transformers.ipynb | 1314 --------------------------------- 8 files changed, 2462 deletions(-) delete mode 100644 demo.py delete mode 100644 demo_sft_hpo_deephyper.py delete mode 100644 demo_sft_hpo_optuna.py delete mode 100644 demo_v1.py delete mode 100755 run_demo_sft_hpo_deephyper.sh delete mode 100755 run_demo_sft_hpo_optuna.sh delete mode 100644 using_sft.ipynb delete mode 100644 using_transformers.ipynb diff --git a/demo.py b/demo.py deleted file mode 100644 index 8494436..0000000 --- a/demo.py +++ /dev/null @@ -1,43 +0,0 @@ -# %% -%load_ext autoreload -%autoreload 2 - -# %% -import pandas as pd -from transformers import AutoTokenizer, BertForSequenceClassification -from data_processing import build_text_data, load_tabular_data, split_dataset -from transformers import pipeline -import torch -# %% - -# merged_df = load_dataframe() -# path = build_vocab(df=merged_df) -# ds_dict_encoded = split_dataset(pretrained_model="bert") - -# %% -DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") -DEVICE = "cpu" -ckp = "bert-base-uncased" -model = BertForSequenceClassification.from_pretrained(ckp, num_labels=2).to(DEVICE) -# %% -tokenizer = AutoTokenizer.from_pretrained(ckp) - -# %% -df = pd.read_csv("output.csv") -df.iloc[0] - -# %% - -# clf = pipeline("text-classification", model=model) -clf = pipeline("text-classification", model=model, tokenizer=tokenizer) -# %% -# for text in df["text"].tolist()[:10]: -# print(clf(text.to(DEVICE))) -# %% -inputs = tokenizer(df["text"].tolist()[:200], padding=True, truncation=True, return_tensors="pt").to(DEVICE) - -outputs = model(**inputs) -# %% -from transformers import AutoModelForSequenceClassification - -# TODO: build a benchmark for sequenceclassification without SFT diff --git a/demo_sft_hpo_deephyper.py b/demo_sft_hpo_deephyper.py deleted file mode 100644 index ef20b8d..0000000 --- a/demo_sft_hpo_deephyper.py +++ /dev/null @@ -1,131 +0,0 @@ -""" Hyperparameter search (HPS) for SFT with DeeyHyper - -* Task: text-classification (binary labels) -* Method: Supervised Fine-tuning -* Dataset: 1000genome -* Pre-trained model: bert-base-uncased - -Ref: -* https://docs.nersc.gov/machinelearning/hpo/ -* https://deephyper.readthedocs.io/en/latest/install/hpc/nersc.html -""" - -import evaluate -import numpy as np -from datasets import load_dataset -from transformers import (AutoModelForSequenceClassification, AutoTokenizer, - BertConfig, DataCollatorWithPadding, Trainer, - TrainingArguments) -from deephyper.problem import HpProblem -from deephyper.search.hps import CBO -from deephyper.evaluator import Evaluator - - -name = "1000genome" -ckp = "bert-base-uncased" - -# load dataset -raw_dataset = load_dataset("csv", - data_files={"train": f"./data/{name}/train.csv", - "validation": f"./data/{name}/validation.csv", - "test": f"./data/{name}/test.csv"}) - -tokenizer = AutoTokenizer.from_pretrained(ckp) -tokenizer_datasets = raw_dataset.map(lambda data: tokenizer(data["text"], truncation=True), batched=True) -data_collator = DataCollatorWithPadding(tokenizer=tokenizer) -tokenizer_datasets = tokenizer_datasets.remove_columns(["text"]) -tokenized_datasets = 
tokenizer_datasets.rename_column("label", "labels") -tokenized_datasets.set_format(type="torch") - -train_dataset = tokenized_datasets['train'] -eval_dataset = tokenized_datasets['validation'] -acc = evaluate.load("accuracy") - - -def compute_metrics(eval_pred): - predictions, labels = eval_pred - predictions = np.argmax(predictions, axis=1) - return acc.compute(predictions=predictions, references=labels) - - -def run(config): - - trainer_config = {"learning_rate": config.get("learning_rate", 5e-5), - "weight_decay": config.get("weight_decay", 0.0), - "adam_beta1": config.get("adam_beta1", 0.9), - "adam_beta2": config.get("adam_beta2", 0.999), - "adam_epsilon": config.get("adam_epsilon", 1e-8), - "max_grad_norm": config.get("max_grad_norm", 1.0), - "num_train_epochs": config.get("num_train_epochs", 3), - "per_device_train_batch_size": config.get("per_device_train_batch_size", 32)} - - # NOTE: not defined in problem - model_config = {"hidden_act": config.get("hidden_act", "gelu"), - "hidden_dropout_prob": config.get("hidden_dropout_prob", 0.1), - "hidden_size": config.get("hidden_size", 768), - "initializer_range": config.get("initializer_range", 0.02), - "intermediate_size": config.get("intermediate_size", 3072), - "layer_norm_eps": config.get("layer_norm_eps", 1e-12), - "max_position_embeddings": config.get("max_position_embeddings", 512), - "num_attention_heads": config.get("num_attention_heads", 12), - "num_hidden_layers": config.get("num_hidden_layers", 12), - "type_vocab_size": config.get("type_vocab_size", 2), - "vocab_size": config.get("vocab_size", 30522)} - # NOTE: add hps in Bert: - # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertConfig - bert_config = BertConfig(**model_config) - - model = AutoModelForSequenceClassification.from_pretrained(ckp, config=bert_config) - - # NOTE: add hps to training arguments - training_args = TrainingArguments( - output_dir="./models/tmp-sft", - overwrite_output_dir=True, - save_strategy="no", - seed=42, - auto_find_batch_size=True, - **trainer_config - ) - trainer = Trainer( - model=model, - args=training_args, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - # compute_metrics=compute_metrics, # NOTE: remove metrics and it will return eval loss by default - tokenizer=tokenizer, - # model_init=model, - data_collator=data_collator, - ) - trainer.train() - - # Evaluate the model - eval_results = trainer.evaluate() - - # Return the evaluation loss, maximize the -loss - return -eval_results['eval_loss'] - - -problem = HpProblem() -# NOTE: define hyperparameters search space -# trainer parameters -problem.add_hyperparameter((1e-6, 1e-4, "log-uniform"), "learning_rate", default_value=5e-5) -problem.add_hyperparameter((0.0, 1e-4), "weight_decay", default_value=0.0) -problem.add_hyperparameter((0.9, 0.999), "adam_beta1", default_value=0.9) -problem.add_hyperparameter((0.9, 0.999), "adam_beta2", default_value=0.999) -problem.add_hyperparameter((1e-9, 1e-7), "adam_epsilon", default_value=1e-8) -problem.add_hyperparameter((0.5, 1.0), "max_grad_norm", default_value=1.0) -problem.add_hyperparameter((3, 10), "num_train_epochs", default_value=3) -problem.add_hyperparameter([16, 32, 64, 128], "per_device_train_batch_size", default_value=32) -# pretrain config parameters - -# define the evaluator to distribute the computation -# TODO: check the method compatible with NERSC for multi-gpu usage -evaluator = Evaluator.create(run, - method="serial", - method_kwargs={ - "num_workers": 2, - }) - -search = CBO(problem, evaluator) - 
-results = search.search(max_evals=10) diff --git a/demo_sft_hpo_optuna.py b/demo_sft_hpo_optuna.py deleted file mode 100644 index 624a181..0000000 --- a/demo_sft_hpo_optuna.py +++ /dev/null @@ -1,121 +0,0 @@ -""" Hyperparameter search (HPS) for SFT with Optuna (with `trainer.hyperparameter_search`) - -* Task: text-classification (binary labels) -* Method: Supervised Fine-tuning -* Dataset: 1000genome -* Pre-trained model: bert-base-uncased - -""" - -import pickle - -import evaluate -import numpy as np -from datasets import load_dataset -from transformers import (AutoModelForSequenceClassification, AutoTokenizer, - BertConfig, DataCollatorWithPadding, Trainer, - TrainingArguments) - - -def optuna_hp_space(trial): - # NOTE: define the hyperparameter search space - return { - "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True), - "weight_decay": trial.suggest_float("weight_decay", 0.0, 1e-4), - "adam_beta1": trial.suggest_float("adam_beta1", 0.9, 0.999), - "adam_beta2": trial.suggest_float("adam_beta2", 0.9, 0.999), - "adam_epsilon": trial.suggest_float("adam_epsilon", 1e-9, 1e-7), - "max_grad_norm": trial.suggest_float("max_grad_norm", 0.5, 1.0), - "num_train_epochs": trial.suggest_int("num_train_epochs", 3, 10), - "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64, 128]), - } - - -name = "1000genome" -ckps = [ - # "albert-base-v2", - # "albert-large-v2", - # "bert-base-cased", - # "bert-base-uncased", - "bert-large-cased", - "bert-large-uncased", - "distilbert-base-cased", - "distilbert-base-uncased", - "roberta-base", - "roberta-large", - "xlnet-base-cased", - "xlnet-large-cased" -] - -# load dataset -raw_dataset = load_dataset("csv", - data_files={"train": f"./data/{name}/train.csv", - "validation": f"./data/{name}/validation.csv", - "test": f"./data/{name}/test.csv"}) - -res = {} -for ckp in ckps: - tokenizer = AutoTokenizer.from_pretrained(ckp) - tokenizer_datasets = raw_dataset.map(lambda data: tokenizer(data["text"], truncation=True), batched=True) - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - tokenizer_datasets = tokenizer_datasets.remove_columns(["text"]) - tokenized_datasets = tokenizer_datasets.rename_column("label", "labels") - tokenized_datasets.set_format(type="torch") - - # NOTE: add hps in Bert: - # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertConfig - config = BertConfig() - train_dataset = tokenized_datasets['train'] - eval_dataset = tokenized_datasets['validation'] - acc = evaluate.load("accuracy") - - def compute_metrics(eval_pred): - predictions, labels = eval_pred - predictions = np.argmax(predictions, axis=1) - return acc.compute(predictions=predictions, references=labels) - - # model_init = AutoModelForSequenceClassification.from_pretrained(ckp, config=config) - - def model_init(trial): - return AutoModelForSequenceClassification.from_pretrained( - ckp, - # from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - # cache_dir=model_args.cache_dir, - # revision=model_args.model_revision, - # token=True if model_args.use_auth_token else None, - ignore_mismatched_sizes=True, - ) - - # set hps to training arguments - training_args = TrainingArguments( - output_dir="./models/tmp-sft", - overwrite_output_dir=True, - save_strategy="no", - seed=42, - auto_find_batch_size=True, - ) - - trainer = Trainer( - model=None, - args=training_args, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - # compute_metrics=compute_metrics, # 
NOTE: remove metrics and it will return eval loss by default - tokenizer=tokenizer, - model_init=model_init, - data_collator=data_collator, - ) - - best_trials = trainer.hyperparameter_search( - direction="minimize", - backend="optuna", - hp_space=optuna_hp_space, - n_trials=20, - # compute_objective=compute_objective, - ) - - print(ckp, best_trials.hyperparameters) - res[ckp] = best_trials.hyperparameters - -pickle.dump(res, open("hps_optuna.pkl", "wb")) diff --git a/demo_v1.py b/demo_v1.py deleted file mode 100644 index 6013633..0000000 --- a/demo_v1.py +++ /dev/null @@ -1,47 +0,0 @@ -# %% - -# %% -import pandas as pd -import torch -from transformers import (AlbertForSequenceClassification, AutoModel, - AutoModelForSequenceClassification, AutoTokenizer, RobertaModel, - BertForSequenceClassification, pipeline) - -from data_processing import build_text_data, load_tabular_data, split_dataset -from sklearn.metrics import accuracy_score, classification_report, confusion_matrix -from tqdm import tqdm - -DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") -DEVICE = "cpu" -print(DEVICE) - -# %% -''' preprocess data ''' -merged_df = load_tabular_data(columns=["wms_delay"]) -build_text_data(df=merged_df) -df = pd.read_csv("output.csv") - -# %% -ckp = "albert-base-v2" -ckp = "bert-base-uncased" -ckp = "roberta-base" -model = AutoModelForSequenceClassification.from_pretrained(ckp, num_labels=2).to(DEVICE) -tokenizer = AutoTokenizer.from_pretrained(ckp) - -# %% - -# %% -clf = pipeline("text-classification", model=model, tokenizer=tokenizer) -torch.cuda.empty_cache() -y_pred = [] -for i in tqdm(range(len(df))): - # tokers = tokenizer([df['text'][i]], padding=True, truncation=True, return_tensors="pt").to(DEVICE) - # outputs = model(**tokers) - y_pred.append(int(clf(df['text'][i])[0]["label"].split("_")[1])) -y_true = df["label"].tolist() -# inputs = tokenizer(df["text"].tolist()[:1000], padding=True, truncation=True, return_tensors="pt").to(DEVICE) -# outputs = model(**inputs) -# outputs.logits.argmax(1) -print(classification_report(y_true, y_pred)) - -# %% diff --git a/run_demo_sft_hpo_deephyper.sh b/run_demo_sft_hpo_deephyper.sh deleted file mode 100755 index 4dbfb1e..0000000 --- a/run_demo_sft_hpo_deephyper.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -#SBATCH --time=03:00:00 -#SBATCH --nodes=1 -#SBATCH --constraint=gpu -#SBATCH --qos=regular -#SBATCH --account=m4144 - -module load conda -conda activate hf -export HF_EVALUATE_OFFLINE=1 - -cd /global/homes/p/papajim/GitHub/poseidon/LLM_AD -python3 /global/homes/p/papajim/GitHub/poseidon/LLM_AD/demo_sft_hpo_deephyper.py - -exit diff --git a/run_demo_sft_hpo_optuna.sh b/run_demo_sft_hpo_optuna.sh deleted file mode 100755 index f2beb5d..0000000 --- a/run_demo_sft_hpo_optuna.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -#SBATCH --time=03:00:00 -#SBATCH --nodes=1 -#SBATCH --constraint=gpu -#SBATCH --qos=regular -#SBATCH --account=m4144 - -module load conda -conda activate hf -export HF_EVALUATE_OFFLINE=1 - -cd /global/homes/p/papajim/GitHub/poseidon/LLM_AD -python3 /global/homes/p/papajim/GitHub/poseidon/LLM_AD/demo_sft_hpo_optuna.py - -exit diff --git a/using_sft.ipynb b/using_sft.ipynb deleted file mode 100644 index e99dd16..0000000 --- a/using_sft.ipynb +++ /dev/null @@ -1,774 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# A full supervised fine-tuning using local data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load dataset" - ] - 
}, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import load_dataset\n", - "import pandas as pd\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Split the dataset into train/validation/test" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloading and preparing dataset csv/default to /tmp/jinh/huggingface/datasets/csv/default-6e4f0556d6300670/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6a896d19d2cb4b8288d2abe04feaa560", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading data files: 0%| | 0/3 [00:00) torch.Size([64, 2])\n" - ] - } - ], - "source": [ - "outputs = model(**batch)\n", - "print(outputs.loss, outputs.logits.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/jinh/miniconda3/envs/hf/lib/python3.11/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - " warnings.warn(\n" - ] - } - ], - "source": [ - "from transformers import AdamW\n", - "\n", - "optimizer = AdamW(model.parameters(), lr=5e-5)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "602\n" - ] - } - ], - "source": [ - "from transformers import get_scheduler\n", - "\n", - "num_epochs = 1\n", - "num_training_steps = num_epochs * len(train_dataloader)\n", - "lr_scheduler = get_scheduler(\n", - " \"linear\",\n", - " optimizer=optimizer,\n", - " num_warmup_steps=0,\n", - " num_training_steps=num_training_steps,\n", - ")\n", - "print(num_training_steps)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "cuda\n" - ] - } - ], - "source": [ - "import torch\n", - "\n", - "DEVICE = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", - "model.to(DEVICE)\n", - "print(DEVICE)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a2f6231e046d4d47a5ce58fd4508ee1b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/602 [00:00), hidden_states=None, attentions=None)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# for this model, it cannot be used for classification directly\n", - "model(**inputs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model heads: Making sense out of 
numbers\n", - "![image](https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter2/transformer_and_head-dark.svg)\n", - "\n", - "Now, consider the model for `text-classification` task." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "output: \n", - " SequenceClassifierOutput(loss=None, logits=tensor([[ 3.3347, -2.7614],\n", - " [ 3.1755, -2.6341],\n", - " [ 3.1681, -2.6305],\n", - " [ 3.2287, -2.6752]], grad_fn=), hidden_states=None, attentions=None)\n", - "logits: \n", - " tensor([[ 3.3347, -2.7614],\n", - " [ 3.1755, -2.6341],\n", - " [ 3.1681, -2.6305],\n", - " [ 3.2287, -2.6752]], grad_fn=)\n", - "prob. : \n", - " tensor([[0.9978, 0.0022],\n", - " [0.9970, 0.0030],\n", - " [0.9970, 0.0030],\n", - " [0.9973, 0.0027]], grad_fn=)\n", - "labels: \n", - " tensor([0, 0, 0, 0])\n", - "true_labels: \n", - " [0, 1, 1, 1]\n" - ] - } - ], - "source": [ - "model = DistilBertForSequenceClassification.from_pretrained(ckp, num_labels=2)\n", - "outputs = model(**inputs)\n", - "print(\"output: \\n\", outputs)\n", - "print(\"logits: \\n\", outputs.logits)\n", - "print(\"prob. : \\n\", torch.nn.functional.softmax(outputs.logits, dim=-1))\n", - "print(\"labels: \\n\", outputs.logits.argmax(dim=-1))\n", - "print(\"true_labels: \\n\", inputs_labels)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "DistilBertForSequenceClassification(\n", - " (distilbert): DistilBertModel(\n", - " (embeddings): Embeddings(\n", - " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n", - " (position_embeddings): Embedding(512, 768)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (transformer): Transformer(\n", - " (layer): ModuleList(\n", - " (0-5): 6 x TransformerBlock(\n", - " (attention): MultiHeadSelfAttention(\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", - " )\n", - " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " (ffn): FFN(\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", - " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", - " (activation): GELUActivation()\n", - " )\n", - " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " )\n", - " )\n", - " )\n", - " )\n", - " (pre_classifier): Linear(in_features=768, out_features=768, bias=True)\n", - " (classifier): Linear(in_features=768, out_features=2, bias=True)\n", - " (dropout): Dropout(p=0.2, inplace=False)\n", - ")\n" - ] - } - ], - "source": [ - "print(model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Handling multiple sentences\n" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tokens ['w', '##ms', '_', 'delay', 'is', '1', '.', '0', 'queue', '_', 'delay', 'is', '4', '.', '0', 'run', '##time', 'is', '6', '.', '0', 'post', '_', 
'script', '_', 'delay', 'is', '5', '.', '0', 'stage', '_', 'in', '_', 'delay', 'is', 'nan', 'stage', '_', 'out', '_', 'delay', 'is', 'nan', 'stage', '_', 'in', '_', 'bytes', 'is', 'nan', 'stage', '_', 'out', '_', 'bytes', 'is', 'nan', 'kicks', '##tar', '##t', '_', 'ex', '##ec', '##utable', '##s', '_', 'cpu', '_', 'time', 'is', '0', '.', '2']\n", - "IDs [1059, 5244, 1035, 8536, 2003, 1015, 1012, 1014, 24240, 1035, 8536, 2003, 1018, 1012, 1014, 2448, 7292, 2003, 1020, 1012, 1014, 2695, 1035, 5896, 1035, 8536, 2003, 1019, 1012, 1014, 2754, 1035, 1999, 1035, 8536, 2003, 16660, 2754, 1035, 2041, 1035, 8536, 2003, 16660, 2754, 1035, 1999, 1035, 27507, 2003, 16660, 2754, 1035, 2041, 1035, 27507, 2003, 16660, 14590, 7559, 2102, 1035, 4654, 8586, 23056, 2015, 1035, 17368, 1035, 2051, 2003, 1014, 1012, 1016]\n", - "Logits tensor([[ 3.7297, -3.0441]], grad_fn=)\n", - "Prob. tensor([[0.9989, 0.0011]], grad_fn=)\n" - ] - } - ], - "source": [ - "# first sentence\n", - "tokens = tokenizer.tokenize([text_df.loc[0,:][\"text\"]][0])\n", - "print(\"Tokens\", tokens)\n", - "ids = tokenizer.convert_tokens_to_ids(tokens)\n", - "print(\"IDs \",ids)\n", - "output = model(torch.tensor([ids]))\n", - "print(\"Logits\", output.logits)\n", - "prob = torch.nn.functional.softmax(output.logits, dim=-1)\n", - "print(\"Prob. \", prob)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__NOTE__:\n", - "* the tokens are splitted into subwords, or integers\n", - "* logits are the model output\n", - "* probabilities indicates the confidence of the model on the prediction between [0, 1]" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\ndf = pd.read_csv(\"output.csv\")\\ntorch.cuda.empty_cache()\\ny_pred = []\\nfor i in range(len(df)):\\n # tokers = tokenizer([df[\\'text\\'][i]], padding=True, truncation=True, return_tensors=\"pt\").to(DEVICE)\\n # outputs = model(**tokers)\\n y_pred.append(int(clf(df[\\'text\\'][i])[0][\"label\"].split(\"_\")[1]))\\ny_true = df[\"label\"].tolist()\\n# inputs = tokenizer(df[\"text\"].tolist()[:1000], padding=True, truncation=True, return_tensors=\"pt\").to(DEVICE)\\n# outputs = model(**inputs)\\n# outputs.logits.argmax(1)\\nclassification_report(y_true, y_pred)\\n'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "'''\n", - "df = pd.read_csv(\"output.csv\")\n", - "torch.cuda.empty_cache()\n", - "y_pred = []\n", - "for i in range(len(df)):\n", - " # tokers = tokenizer([df['text'][i]], padding=True, truncation=True, return_tensors=\"pt\").to(DEVICE)\n", - " # outputs = model(**tokers)\n", - " y_pred.append(int(clf(df['text'][i])[0][\"label\"].split(\"_\")[1]))\n", - "y_true = df[\"label\"].tolist()\n", - "# inputs = tokenizer(df[\"text\"].tolist()[:1000], padding=True, truncation=True, return_tensors=\"pt\").to(DEVICE)\n", - "# outputs = model(**inputs)\n", - "# outputs.logits.argmax(1)\n", - "classification_report(y_true, y_pred)\n", - "'''" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### wrapping up: from tokenizer to model" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "pred labels tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0])\n", - "true labels [0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1]\n" - ] - } - ], - "source": [ - "inputs = tokenizer(text_df['text'].tolist()[:100], padding=True, truncation=True, return_tensors=\"pt\")\n", - "output = model(**inputs)\n", - "print(\"pred labels\", output.logits.argmax(1))\n", - "print(\"true labels\", text_df['label'].tolist()[:100])" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.metrics import accuracy_score, classification_report\n" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " 0 1.00 0.26 0.41 100\n", - " 1 0.00 0.00 0.00 0\n", - "\n", - " accuracy 0.26 100\n", - " macro avg 0.50 0.13 0.21 100\n", - "weighted avg 1.00 0.26 0.41 100\n", - "\n" - ] - } - ], - "source": [ - "rep = classification_report(output.logits.argmax(1).detach().cpu().numpy(), text_df['label'].tolist()[:100])\n", - "print(rep)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__NOTE__:\n", - "* tokenizer and model take all the input is inefficient. `OOM` issue on both CPU and GPU.\n", - "* " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Supervised Fine-Tuning\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import load_dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset csv (/tmp/jinh/huggingface/datasets/csv/default-0a7c04ab8c22fc34/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "7fe4a4f05c5349c499ada5f3f54f877f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/3 [00:003\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mtokenizer_function\u001b[39m(data):\n\u001b[1;32m 4\u001b[0m \u001b[39mreturn\u001b[39;00m tokenizer(data[\u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m], truncation\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m----> 6\u001b[0m tokenized_datasets \u001b[39m=\u001b[39m raw_datasets\u001b[39m.\u001b[39mmap(tokenizer_function, batched\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[1;32m 7\u001b[0m data_collator \u001b[39m=\u001b[39m DataCollatorWithPadding(tokenizer\u001b[39m=\u001b[39mtokenizer)\n", - "File \u001b[0;32m~/miniconda3/envs/hf/lib/python3.11/site-packages/datasets/dataset_dict.py:851\u001b[0m, in \u001b[0;36mDatasetDict.map\u001b[0;34m(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_names, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, desc)\u001b[0m\n\u001b[1;32m 848\u001b[0m 
\u001b[39mif\u001b[39;00m cache_file_names \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 849\u001b[0m cache_file_names \u001b[39m=\u001b[39m {k: \u001b[39mNone\u001b[39;00m \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m}\n\u001b[1;32m 850\u001b[0m \u001b[39mreturn\u001b[39;00m DatasetDict(\n\u001b[0;32m--> 851\u001b[0m {\n\u001b[1;32m 852\u001b[0m k: dataset\u001b[39m.\u001b[39mmap(\n\u001b[1;32m 853\u001b[0m function\u001b[39m=\u001b[39mfunction,\n\u001b[1;32m 854\u001b[0m with_indices\u001b[39m=\u001b[39mwith_indices,\n\u001b[1;32m 855\u001b[0m with_rank\u001b[39m=\u001b[39mwith_rank,\n\u001b[1;32m 856\u001b[0m input_columns\u001b[39m=\u001b[39minput_columns,\n\u001b[1;32m 857\u001b[0m batched\u001b[39m=\u001b[39mbatched,\n\u001b[1;32m 858\u001b[0m batch_size\u001b[39m=\u001b[39mbatch_size,\n\u001b[1;32m 859\u001b[0m drop_last_batch\u001b[39m=\u001b[39mdrop_last_batch,\n\u001b[1;32m 860\u001b[0m remove_columns\u001b[39m=\u001b[39mremove_columns,\n\u001b[1;32m 861\u001b[0m keep_in_memory\u001b[39m=\u001b[39mkeep_in_memory,\n\u001b[1;32m 862\u001b[0m load_from_cache_file\u001b[39m=\u001b[39mload_from_cache_file,\n\u001b[1;32m 863\u001b[0m cache_file_name\u001b[39m=\u001b[39mcache_file_names[k],\n\u001b[1;32m 864\u001b[0m writer_batch_size\u001b[39m=\u001b[39mwriter_batch_size,\n\u001b[1;32m 865\u001b[0m features\u001b[39m=\u001b[39mfeatures,\n\u001b[1;32m 866\u001b[0m disable_nullable\u001b[39m=\u001b[39mdisable_nullable,\n\u001b[1;32m 867\u001b[0m fn_kwargs\u001b[39m=\u001b[39mfn_kwargs,\n\u001b[1;32m 868\u001b[0m num_proc\u001b[39m=\u001b[39mnum_proc,\n\u001b[1;32m 869\u001b[0m desc\u001b[39m=\u001b[39mdesc,\n\u001b[1;32m 870\u001b[0m )\n\u001b[1;32m 871\u001b[0m \u001b[39mfor\u001b[39;00m k, dataset \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mitems()\n\u001b[1;32m 872\u001b[0m }\n\u001b[1;32m 873\u001b[0m )\n", - "File \u001b[0;32m~/miniconda3/envs/hf/lib/python3.11/site-packages/datasets/dataset_dict.py:852\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 848\u001b[0m \u001b[39mif\u001b[39;00m cache_file_names \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 849\u001b[0m cache_file_names \u001b[39m=\u001b[39m {k: \u001b[39mNone\u001b[39;00m \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m}\n\u001b[1;32m 850\u001b[0m \u001b[39mreturn\u001b[39;00m DatasetDict(\n\u001b[1;32m 851\u001b[0m {\n\u001b[0;32m--> 852\u001b[0m k: dataset\u001b[39m.\u001b[39mmap(\n\u001b[1;32m 853\u001b[0m function\u001b[39m=\u001b[39mfunction,\n\u001b[1;32m 854\u001b[0m with_indices\u001b[39m=\u001b[39mwith_indices,\n\u001b[1;32m 855\u001b[0m with_rank\u001b[39m=\u001b[39mwith_rank,\n\u001b[1;32m 856\u001b[0m input_columns\u001b[39m=\u001b[39minput_columns,\n\u001b[1;32m 857\u001b[0m batched\u001b[39m=\u001b[39mbatched,\n\u001b[1;32m 858\u001b[0m batch_size\u001b[39m=\u001b[39mbatch_size,\n\u001b[1;32m 859\u001b[0m drop_last_batch\u001b[39m=\u001b[39mdrop_last_batch,\n\u001b[1;32m 860\u001b[0m remove_columns\u001b[39m=\u001b[39mremove_columns,\n\u001b[1;32m 861\u001b[0m keep_in_memory\u001b[39m=\u001b[39mkeep_in_memory,\n\u001b[1;32m 862\u001b[0m load_from_cache_file\u001b[39m=\u001b[39mload_from_cache_file,\n\u001b[1;32m 863\u001b[0m cache_file_name\u001b[39m=\u001b[39mcache_file_names[k],\n\u001b[1;32m 864\u001b[0m writer_batch_size\u001b[39m=\u001b[39mwriter_batch_size,\n\u001b[1;32m 865\u001b[0m features\u001b[39m=\u001b[39mfeatures,\n\u001b[1;32m 866\u001b[0m 
disable_nullable\u001b[39m=\u001b[39mdisable_nullable,\n\u001b[1;32m 867\u001b[0m fn_kwargs\u001b[39m=\u001b[39mfn_kwargs,\n\u001b[1;32m 868\u001b[0m num_proc\u001b[39m=\u001b[39mnum_proc,\n\u001b[1;32m 869\u001b[0m desc\u001b[39m=\u001b[39mdesc,\n\u001b[1;32m 870\u001b[0m )\n\u001b[1;32m 871\u001b[0m \u001b[39mfor\u001b[39;00m k, dataset \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mitems()\n\u001b[1;32m 872\u001b[0m }\n\u001b[1;32m 873\u001b[0m )\n", - "File \u001b[0;32m~/miniconda3/envs/hf/lib/python3.11/site-packages/datasets/arrow_dataset.py:580\u001b[0m, in \u001b[0;36mtransmit_tasks..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 578\u001b[0m \u001b[39mself\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m \u001b[39m=\u001b[39m kwargs\u001b[39m.\u001b[39mpop(\u001b[39m\"\u001b[39m\u001b[39mself\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 579\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 580\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39m, \u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 581\u001b[0m datasets: List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) \u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 582\u001b[0m \u001b[39mfor\u001b[39;00m dataset \u001b[39min\u001b[39;00m datasets:\n\u001b[1;32m 583\u001b[0m \u001b[39m# Remove task templates if a column mapping of the template is no longer valid\u001b[39;00m\n", - "File \u001b[0;32m~/miniconda3/envs/hf/lib/python3.11/site-packages/datasets/arrow_dataset.py:545\u001b[0m, in \u001b[0;36mtransmit_format..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 538\u001b[0m self_format \u001b[39m=\u001b[39m {\n\u001b[1;32m 539\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtype\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_type,\n\u001b[1;32m 540\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mformat_kwargs\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_kwargs,\n\u001b[1;32m 541\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mcolumns\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_columns,\n\u001b[1;32m 542\u001b[0m \u001b[39m\"\u001b[39m\u001b[39moutput_all_columns\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_output_all_columns,\n\u001b[1;32m 543\u001b[0m }\n\u001b[1;32m 544\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 545\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39m, \u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 546\u001b[0m datasets: List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) \u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 547\u001b[0m \u001b[39m# 
re-apply format to the output\u001b[39;00m\n", - "File \u001b[0;32m~/miniconda3/envs/hf/lib/python3.11/site-packages/datasets/arrow_dataset.py:3087\u001b[0m, in \u001b[0;36mDataset.map\u001b[0;34m(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)\u001b[0m\n\u001b[1;32m 3079\u001b[0m \u001b[39mif\u001b[39;00m transformed_dataset \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 3080\u001b[0m \u001b[39mwith\u001b[39;00m logging\u001b[39m.\u001b[39mtqdm(\n\u001b[1;32m 3081\u001b[0m disable\u001b[39m=\u001b[39m\u001b[39mnot\u001b[39;00m logging\u001b[39m.\u001b[39mis_progress_bar_enabled(),\n\u001b[1;32m 3082\u001b[0m unit\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m examples\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3085\u001b[0m desc\u001b[39m=\u001b[39mdesc \u001b[39mor\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mMap\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 3086\u001b[0m ) \u001b[39mas\u001b[39;00m pbar:\n\u001b[0;32m-> 3087\u001b[0m \u001b[39mfor\u001b[39;00m rank, done, content \u001b[39min\u001b[39;00m Dataset\u001b[39m.\u001b[39m_map_single(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdataset_kwargs):\n\u001b[1;32m 3088\u001b[0m \u001b[39mif\u001b[39;00m done:\n\u001b[1;32m 3089\u001b[0m shards_done \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n", - "File \u001b[0;32m~/miniconda3/envs/hf/lib/python3.11/site-packages/datasets/arrow_dataset.py:3463\u001b[0m, in \u001b[0;36mDataset._map_single\u001b[0;34m(shard, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset)\u001b[0m\n\u001b[1;32m 3459\u001b[0m indices \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(\n\u001b[1;32m 3460\u001b[0m \u001b[39mrange\u001b[39m(\u001b[39m*\u001b[39m(\u001b[39mslice\u001b[39m(i, i \u001b[39m+\u001b[39m batch_size)\u001b[39m.\u001b[39mindices(shard\u001b[39m.\u001b[39mnum_rows)))\n\u001b[1;32m 3461\u001b[0m ) \u001b[39m# Something simpler?\u001b[39;00m\n\u001b[1;32m 3462\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m-> 3463\u001b[0m batch \u001b[39m=\u001b[39m apply_function_on_filtered_inputs(\n\u001b[1;32m 3464\u001b[0m batch,\n\u001b[1;32m 3465\u001b[0m indices,\n\u001b[1;32m 3466\u001b[0m check_same_num_examples\u001b[39m=\u001b[39m\u001b[39mlen\u001b[39m(shard\u001b[39m.\u001b[39mlist_indexes()) \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m,\n\u001b[1;32m 3467\u001b[0m offset\u001b[39m=\u001b[39moffset,\n\u001b[1;32m 3468\u001b[0m )\n\u001b[1;32m 3469\u001b[0m \u001b[39mexcept\u001b[39;00m NumExamplesMismatchError:\n\u001b[1;32m 3470\u001b[0m \u001b[39mraise\u001b[39;00m DatasetTransformationNotAllowedError(\n\u001b[1;32m 3471\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mUsing `.map` in batched mode on a dataset with attached indexes is allowed only if it doesn\u001b[39m\u001b[39m'\u001b[39m\u001b[39mt create or remove existing examples. 
You can first run `.drop_index() to remove your index and then re-add it.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 3472\u001b[0m ) \u001b[39mfrom\u001b[39;00m \u001b[39mNone\u001b[39;00m\n", - "File \u001b[0;32m~/miniconda3/envs/hf/lib/python3.11/site-packages/datasets/arrow_dataset.py:3344\u001b[0m, in \u001b[0;36mDataset._map_single..apply_function_on_filtered_inputs\u001b[0;34m(pa_inputs, indices, check_same_num_examples, offset)\u001b[0m\n\u001b[1;32m 3342\u001b[0m \u001b[39mif\u001b[39;00m with_rank:\n\u001b[1;32m 3343\u001b[0m additional_args \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (rank,)\n\u001b[0;32m-> 3344\u001b[0m processed_inputs \u001b[39m=\u001b[39m function(\u001b[39m*\u001b[39mfn_args, \u001b[39m*\u001b[39madditional_args, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mfn_kwargs)\n\u001b[1;32m 3345\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(processed_inputs, LazyDict):\n\u001b[1;32m 3346\u001b[0m processed_inputs \u001b[39m=\u001b[39m {\n\u001b[1;32m 3347\u001b[0m k: v \u001b[39mfor\u001b[39;00m k, v \u001b[39min\u001b[39;00m processed_inputs\u001b[39m.\u001b[39mdata\u001b[39m.\u001b[39mitems() \u001b[39mif\u001b[39;00m k \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m processed_inputs\u001b[39m.\u001b[39mkeys_to_format\n\u001b[1;32m 3348\u001b[0m }\n", - "\u001b[1;32m/home/jinh/PoSeiDon/llm_ad/using_transformers.ipynb Cell 36\u001b[0m line \u001b[0;36m4\n\u001b[1;32m 3\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mtokenizer_function\u001b[39m(data):\n\u001b[0;32m----> 4\u001b[0m \u001b[39mreturn\u001b[39;00m tokenizer(data[\u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m], truncation\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n", - "\u001b[0;31mNameError\u001b[0m: name 'tokenizer' is not defined" - ] - } - ], - "source": [ - "# setup tokenizer function\n", - "from transformers import DataCollatorWithPadding\n", - "def tokenizer_function(data):\n", - " return tokenizer(data[\"text\"], truncation=True)\n", - "\n", - "tokenized_datasets = raw_datasets.map(tokenizer_function, batched=True)\n", - "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
[14427/14427 10:50, Epoch 3/3]
Step    Training Loss
500     0.541000
1000    0.559700
1500    0.543100
2000    0.543000
2500    0.530200
3000    0.550100
3500    0.549800
4000    0.531500
4500    0.543900
5000    0.547900
5500    0.544300
6000    0.534000
6500    0.536400
7000    0.529500
7500    0.557400
8000    0.537500
8500    0.534000
9000    0.537900
9500    0.535200
10000   0.535700
10500   0.527200
11000   0.525100
11500   0.546300
12000   0.540500
12500   0.553300
13000   0.547600
13500   0.518700
14000   0.539300

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "TrainOutput(global_step=14427, training_loss=0.5403152088237468, metrics={'train_runtime': 650.842, 'train_samples_per_second': 177.32, 'train_steps_per_second': 22.167, 'total_flos': 298587208529160.0, 'train_loss': 0.5403152088237468, 'epoch': 3.0})" - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# trainer API\n", - "from transformers import TrainingArguments, Trainer\n", - "\n", - "training_args = TrainingArguments(output_dir=\"sft\", \n", - " save_strategy=\"epoch\", overwrite_output_dir=True)\n", - "trainer = Trainer(model, training_args, \n", - " train_dataset=tokenized_datasets[\"train\"], \n", - " eval_dataset=tokenized_datasets[\"validation\"], \n", - " data_collator=data_collator, \n", - " tokenizer=tokenizer)\n", - "\n", - "trainer.train()\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(4809, 2) (4809,)\n" - ] - } - ], - "source": [ - "# NOTE: evaluate on the validation set\n", - "predictions = trainer.predict(tokenized_datasets[\"validation\"])\n", - "print(predictions.predictions.shape, predictions.label_ids.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "preds = np.argmax(predictions.predictions, axis=-1)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "106" - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(preds==0).sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'accuracy': 0.7602412143896861,\n", - " 'f1': 0.8602932267054405,\n", - " 'precision': 0.7548373378694451,\n", - " 'recall': 1.0}" - ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import evaluate\n", - "\n", - "metric = evaluate.combine([\"accuracy\", \"f1\", \"precision\", \"recall\"])\n", - "metric.compute(predictions=preds, references=predictions.label_ids)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1.0" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "accuracy_score(predictions.label_ids, preds)" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [], - "source": [ - "# wrapping everything together, we can define a `compute_metrics` function and put in trainer\n", - "def compute_metrics(eval_preds):\n", - " metric = evaluate.combine([\"accuracy\", \"f1\", \"precision\", \"recall\"])\n", - " logits, labels = eval_preds\n", - " predictions = np.argmax(logits, axis=-1)\n", - " return metric.compute(predictions=predictions, references=labels)\n", - " # return accuracy_score(labels, predictions)" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "

[1803/1803 01:28, Epoch 3/3]
Step    Training Loss
500     0.000000
1000    0.000000
1500    0.013400

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "TrainOutput(global_step=1803, training_loss=0.0037077239942009204, metrics={'train_runtime': 88.7796, 'train_samples_per_second': 162.47, 'train_steps_per_second': 20.309, 'total_flos': 351620221110624.0, 'train_loss': 0.0037077239942009204, 'epoch': 3.0})" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# update trainer\n", - "trainer = Trainer(\n", - " model,\n", - " training_args,\n", - " train_dataset=tokenized_datasets[\"train\"],\n", - " eval_dataset=tokenized_datasets[\"validation\"],\n", - " data_collator=data_collator,\n", - " tokenizer=tokenizer,\n", - " compute_metrics=compute_metrics,\n", - ")\n", - "trainer.train()" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "ename": "ValueError", - "evalue": "too many values to unpack (expected 2)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/home/jinh/PoSeiDon/llm_ad/using_transformers.ipynb Cell 44\u001b[0m line \u001b[0;36m2\n\u001b[1;32m 1\u001b[0m predictions \u001b[39m=\u001b[39m trainer\u001b[39m.\u001b[39mpredict(tokenized_datasets[\u001b[39m\"\u001b[39m\u001b[39mtest\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[0;32m----> 2\u001b[0m compute_metrics(predictions)\n", - "\u001b[1;32m/home/jinh/PoSeiDon/llm_ad/using_transformers.ipynb Cell 44\u001b[0m line \u001b[0;36m4\n\u001b[1;32m 2\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mcompute_metrics\u001b[39m(eval_preds):\n\u001b[1;32m 3\u001b[0m metric \u001b[39m=\u001b[39m evaluate\u001b[39m.\u001b[39mcombine([\u001b[39m\"\u001b[39m\u001b[39maccuracy\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mf1\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mprecision\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mrecall\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[0;32m----> 4\u001b[0m logits, labels \u001b[39m=\u001b[39m eval_preds\n\u001b[1;32m 5\u001b[0m predictions \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39margmax(logits, axis\u001b[39m=\u001b[39m\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m)\n\u001b[1;32m 6\u001b[0m \u001b[39mreturn\u001b[39;00m metric\u001b[39m.\u001b[39mcompute(predictions\u001b[39m=\u001b[39mpredictions, references\u001b[39m=\u001b[39mlabels)\n", - "\u001b[0;31mValueError\u001b[0m: too many values to unpack (expected 2)" - ] - } - ], - "source": [ - "predictions = trainer.predict(tokenized_datasets[\"test\"])\n", - "compute_metrics(predictions)" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'test_loss': 3.4706106877280263e-09,\n", - " 'test_accuracy': 1.0,\n", - " 'test_f1': 1.0,\n", - " 'test_precision': 1.0,\n", - " 'test_recall': 1.0,\n", - " 'test_runtime': 58.3066,\n", - " 'test_samples_per_second': 659.788,\n", - " 'test_steps_per_second': 82.478}" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "predictions.metrics\n" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "Number of parameters: 66,955,010\n" - ] - } - ], - "source": [ - "num_params = sum(p.numel() for p in model.parameters())\n", - "print(\"Number of parameters: {:,}\".format(num_params))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "hf", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.3" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -}