dataset.py
from datasets.utils import disable_progress_bar
disable_progress_bar()
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import torch
import numpy as np
import random


def load_nlp_dataset(dataset=None):
    # Load the train/validation/test CSV splits for the given dataset name.
    data_files = {'train': './dataset/{}_train.csv'.format(dataset),
                  'validation': './dataset/{}_val.csv'.format(dataset),
                  'test': './dataset/{}_test.csv'.format(dataset)}
    train_dataset = load_dataset('csv', data_files=data_files, split='train', ignore_verifications=False, name=dataset)
    eval_dataset = load_dataset('csv', data_files=data_files, split='validation', ignore_verifications=False, name=dataset)
    test_dataset = load_dataset('csv', data_files=data_files, split='test', ignore_verifications=False, name=dataset)
    return train_dataset, eval_dataset, test_dataset


def pad_seq(seq, max_batch_len: int, pad_value: int):
    # Right-pad a token sequence to the longest length in the batch.
    return seq + (max_batch_len - len(seq)) * [pad_value]


def collate_batch(batch, tokenizer, device):
    # Dynamically pad input_ids / attention_mask to the batch maximum,
    # collect labels when present, and move the tensors to the target device.
    batch_inputs = list()
    batch_attention_masks = list()
    labels = list()
    max_size = max([len(ex['input_ids']) for ex in batch])
    for item in batch:
        batch_inputs += [pad_seq(item['input_ids'], max_size, tokenizer.pad_token_id)]
        batch_attention_masks += [pad_seq(item['attention_mask'], max_size, 0)]
        if "label" in item:
            labels.append(item['label'])
    return {"input_ids": torch.tensor(batch_inputs, dtype=torch.long).to(device),
            "attention_mask": torch.tensor(batch_attention_masks, dtype=torch.long).to(device),
            "labels": torch.tensor(labels, dtype=torch.long).to(device)}


def prepare_single_bert(texts, tokenizer, batch_size=1, max_len=64, device='cpu'):
    # Wrap a list of raw texts in a DataLoader that yields tokenized, padded batches.
    def encode(examples):
        return tokenizer(examples['text'], truncation=True, max_length=max_len)

    my_dict = {"text": texts}
    dataset = Dataset.from_dict(my_dict)
    dataset = dataset.map(encode, batched=False)
    data_iter = DataLoader(
        dataset,
        shuffle=False,
        batch_size=batch_size,
        collate_fn=lambda p: collate_batch(p, tokenizer, device),
        drop_last=False,
    )
    return data_iter


def prepare_dataset_bert(model, dataset_name, batch_size=32, max_len=64, device='cpu'):
    # Build train/validation/test DataLoaders plus the tokenizer for the given model checkpoint.
    train_dataset, eval_dataset, test_dataset = load_nlp_dataset(dataset_name)
    tokenizer = AutoTokenizer.from_pretrained(model)

    def seed_worker(worker_id):
        # Derive a per-worker seed from the main process seed for reproducible shuffling.
        worker_seed = torch.initial_seed() % 2**32
        np.random.seed(worker_seed)
        random.seed(worker_seed)

    def encode(examples):
        return tokenizer(examples['text'], truncation=True, max_length=max_len)

    # Tokenize each split and copy the 'label' column into 'labels' as expected by the collator.
    train_dataset = train_dataset.map(encode, batched=True)
    train_dataset = train_dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
    eval_dataset = eval_dataset.map(encode, batched=True)
    eval_dataset = eval_dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
    test_dataset = test_dataset.map(encode, batched=True)
    test_dataset = test_dataset.map(lambda examples: {'labels': examples['label']}, batched=True)

    train_iter = DataLoader(
        train_dataset,
        shuffle=True,
        batch_size=batch_size,
        collate_fn=lambda p: collate_batch(p, tokenizer, device),
        worker_init_fn=seed_worker,
        drop_last=True,
    )
    val_iter = DataLoader(
        eval_dataset,
        shuffle=True,
        batch_size=batch_size,
        collate_fn=lambda p: collate_batch(p, tokenizer, device),
        worker_init_fn=seed_worker,
        drop_last=True,
    )
    test_iter = DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=batch_size,
        collate_fn=lambda p: collate_batch(p, tokenizer, device),
        worker_init_fn=seed_worker,
        drop_last=False,
    )
    return train_iter, val_iter, test_iter, tokenizer
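

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how these helpers fit together, assuming CSV splits named
# ./dataset/<name>_train.csv, <name>_val.csv and <name>_test.csv exist on disk.
# 'sst2' is a hypothetical dataset name and 'bert-base-uncased' a hypothetical
# model checkpoint; substitute whatever the surrounding project actually uses.
#
# if __name__ == "__main__":
#     train_iter, val_iter, test_iter, tokenizer = prepare_dataset_bert(
#         'bert-base-uncased', 'sst2', batch_size=32, max_len=64, device='cpu')
#     batch = next(iter(train_iter))
#     print(batch['input_ids'].shape, batch['labels'].shape)
#
#     # Tokenize a few raw strings for inference with the same collate logic.
#     infer_iter = prepare_single_bert(["a short example sentence"], tokenizer,
#                                      batch_size=1, max_len=64, device='cpu')
#     print(next(iter(infer_iter))['input_ids'])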