forked from piEsposito/br-quad-2.0
-
Notifications
You must be signed in to change notification settings - Fork 0
/
squad_v2.py
121 lines (102 loc) · 4.76 KB
/
squad_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from __future__ import absolute_import, division, print_function
import json
import os
import datasets
# TODO(squad_v2): BibTeX citation
_CITATION = """\
@article{2020braquad,
author = {{Esposito}, Wladimir and {Esposito}, Piero and {Tamais},
Ana Laura and {Gatti}, Daniel},
title = "{BrQuAD - Brazilian
Question-Answering Dataset: Dataset para benchmark de modelos de
Machine Learning para question-answering em
Portugu^es brasileiro traduzindo o SQuAD com Google Cloud API}",
year = 2020,
}
"""
_DESCRIPTION = """\
Translates SQuAD 2.0 from english to portuguese using Google Cloud API
"""
_URL = "https://github.com/piEsposito/br-quad-2.0/blob/main/data/"
_DEV_FILE = "brquad-gte-dev-v2.0.json"
_TRAINING_FILE = "brquad-gte-dev-v2.0.json"
class SquadV2Config(datasets.BuilderConfig):
"""BuilderConfig for SQUAD."""
def __init__(self, **kwargs):
"""BuilderConfig for SQUADV2.
Args:
**kwargs: keyword arguments forwarded to super.
"""
super(SquadV2Config, self).__init__(**kwargs)
class SquadV2(datasets.GeneratorBasedBuilder):
"""TODO(squad_v2): Short description of my dataset."""
# TODO(squad_v2): Set up version.
BUILDER_CONFIGS = [
SquadV2Config(name="squad_v2", version=datasets.Version("2.0.0"), description="SQuAD plaint text version 2"),
]
def _info(self):
# TODO(squad_v2): Specifies the datasets.DatasetInfo object
return datasets.DatasetInfo(
# This is the description that will appear on the datasets page.
description=_DESCRIPTION,
# datasets.features.FeatureConnectors
features=datasets.Features(
{
"id": datasets.Value("string"),
"title": datasets.Value("string"),
"context": datasets.Value("string"),
"question": datasets.Value("string"),
"answers": datasets.features.Sequence(
{
"text": datasets.Value("string"),
"answer_start": datasets.Value("int32"),
}
),
# These are the features of your dataset like images, labels ...
}
),
# If there's a common (input, target) tuple from the features,
# specify them here. They'll be used if as_supervised=True in
# builder.as_dataset.
supervised_keys=None,
# Homepage of the dataset for documentation
homepage="https://rajpurkar.github.io/SQuAD-explorer/",
citation=_CITATION,
)
def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
# TODO(squad_v2): Downloads the data and defines the splits
# dl_manager is a datasets.download.DownloadManager that can be used to
# download and extract URLs
urls_to_download = {"train": os.path.join(_URL, _TRAINING_FILE), "dev": os.path.join(_URL, _DEV_FILE)}
downloaded_files = dl_manager.download_and_extract(urls_to_download)
return [
datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}),
datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["dev"]}),
]
def _generate_examples(self, filepath):
"""Yields examples."""
# TODO(squad_v2): Yields (key, example) tuples from the dataset
with open(filepath, encoding="utf-8") as f:
squad = json.load(f)
for example in squad["data"]:
title = example.get("title", "").strip()
for paragraph in example["paragraphs"]:
context = paragraph["context"].strip()
for qa in paragraph["qas"]:
question = qa["question"].strip()
id_ = qa["id"]
answer_starts = [answer["answer_start"] for answer in qa["answers"]]
answers = [answer["text"].strip() for answer in qa["answers"]]
# Features currently used are "context", "question", and "answers".
# Others are extracted here for the ease of future expansions.
yield id_, {
"title": title,
"context": context,
"question": question,
"id": id_,
"answers": {
"answer_start": answer_starts,
"text": answers,
},
}