import os
import random
import logging

import numpy as np
import torch
from scipy.stats import pearsonr, spearmanr
from seqeval import metrics as seqeval_metrics
from sklearn import metrics as sklearn_metrics
from transformers import (
    ElectraConfig,
    ElectraTokenizer,
    ElectraForSequenceClassification,
    ElectraForTokenClassification,
)

MODEL_CLASSES = {
    "koelectra-base-v3": (ElectraConfig, ElectraForTokenClassification, ElectraTokenizer),
}

MODEL_PATH_MAP = {
    "koelectra-base-v3": "monologg/koelectra-base-v3-discriminator",
}

TOKENIZER_CLASSES = {
    "koelectra-base-v3": ElectraTokenizer,
}

MODEL_FOR_SEQUENCE_CLASSIFICATION = {
    "koelectra-base-v3": ElectraForSequenceClassification,
}

MODEL_FOR_TOKEN_CLASSIFICATION = {
    "koelectra-base-v3": ElectraForTokenClassification,
}
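
# Illustrative use of the registries above (a sketch, not part of this module's
# API): `args` is a hypothetical argparse namespace with
# model_type="koelectra-base-v3".
#
#   config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
#   config = config_class.from_pretrained(MODEL_PATH_MAP[args.model_type])
#   model = model_class.from_pretrained(MODEL_PATH_MAP[args.model_type], config=config)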


def init_logger():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )


def simple_accuracy(labels, preds):
    return (labels == preds).mean()


def acc_score(labels, preds):
    return {
        "acc": simple_accuracy(labels, preds),
    }


def pearson_and_spearman(labels, preds):
    pearson_corr = pearsonr(preds, labels)[0]
    spearman_corr = spearmanr(preds, labels)[0]
    return {
        "pearson": pearson_corr,
        "spearmanr": spearman_corr,
        "corr": (pearson_corr + spearman_corr) / 2,
    }


def get_test_texts(args):
    texts = []
    with open(os.path.join(args.data_dir, args.test_file), "r", encoding="utf-8") as f:
        for line in f:
            text, _ = line.split("\t")
            text = text.split()
            texts.append(text)
    return texts
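
# Expected test-file layout (illustrative, inferred from the parsing above): one
# example per line, tab-separated, with the space-tokenized sentence first and
# the label sequence second, e.g.
#
#   word1 word2 word3\tPER-B PER-I O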


def get_labels(args):
    with open(os.path.join(args.data_dir, args.label_file), "r", encoding="utf-8") as f:
        return [label.strip() for label in f]


def load_tokenizer(args):
    return MODEL_CLASSES[args.model_type][2].from_pretrained(args.model_name_or_path)


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if not args.no_cuda and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)


def f1_pre_rec(labels, preds, is_ner=True):
    if is_ner:
        return {
            "precision": seqeval_metrics.precision_score(labels, preds, suffix=True),
            "recall": seqeval_metrics.recall_score(labels, preds, suffix=True),
            "f1": seqeval_metrics.f1_score(labels, preds, suffix=True),
        }
    else:
        return {
            "precision": sklearn_metrics.precision_score(labels, preds, average="macro"),
            "recall": sklearn_metrics.recall_score(labels, preds, average="macro"),
            "f1": sklearn_metrics.f1_score(labels, preds, average="macro"),
        }
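
# Illustrative check of the NER branch: seqeval scores at the entity level and,
# with suffix=True, expects tags written as "PER-B"/"PER-I" rather than "B-PER".
#
#   gold = [["PER-B", "PER-I", "O"]]
#   pred = [["PER-B", "PER-I", "O"]]
#   f1_pre_rec(gold, pred, is_ner=True)  # exact entity match -> all scores 1.0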


def compute_metrics(task_name, labels, preds):
    if task_name == "naver-ner":
        return f1_pre_rec(labels, preds, is_ner=True)
    raise KeyError(task_name)


def show_ner_report(labels, preds):
    return seqeval_metrics.classification_report(labels, preds, suffix=True)
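
# Illustrative call order from a training script (a sketch, assuming a
# hypothetical argparse namespace `args` with the fields referenced above):
#
#   init_logger()
#   set_seed(args)
#   tokenizer = load_tokenizer(args)
#   label_list = get_labels(args)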