diff --git a/nusantara/nusa_datasets/id_qqp/id_qqp.py b/nusantara/nusa_datasets/id_qqp/id_qqp.py
new file mode 100644
index 00000000..87f292e0
--- /dev/null
+++ b/nusantara/nusa_datasets/id_qqp/id_qqp.py
@@ -0,0 +1,131 @@
+import json
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from nusantara.utils import schemas
+from nusantara.utils.configs import NusantaraConfig
+from nusantara.utils.constants import Tasks
+
+_CITATION = """\
+https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs
+"""
+
+_DATASETNAME = "id_qqp"
+
+_DESCRIPTION = """\
+ID QQP (Indonesian Quora Question Pairs) is a paraphrasing dataset adapted from
+the Quora Question Pairs corpus. Each example is a pair of Indonesian questions
+that paraphrase each other.
+"""
+
+_HOMEPAGE = "https://github.com/louisowen6/quora_paraphrasing_id"
+
+_LICENSE = "Apache License, Version 2.0"
+
+_URLS = {
+    _DATASETNAME: [
+        "https://github.com/louisowen6/quora_paraphrasing_id/raw/main/ID_Quora_Paraphrasing_train.json",
+        "https://github.com/louisowen6/quora_paraphrasing_id/raw/main/ID_Quora_Paraphrasing_val.json",
+    ]
+}
+
+_SUPPORTED_TASKS = [Tasks.PARAPHRASING]
+
+_SOURCE_VERSION = "1.0.0"
+
+_NUSANTARA_VERSION = "1.0.0"
+
+
+class IdQQP(datasets.GeneratorBasedBuilder):
+    """ID QQP is a paraphrasing dataset of Indonesian question pairs adapted from the Quora Question Pairs corpus."""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION)
+
+    BUILDER_CONFIGS = [
+        NusantaraConfig(
+            name="id_qqp_source",
+            version=SOURCE_VERSION,
+            description="ID QQP source schema",
+            schema="source",
+            subset_id="id_qqp",
+        ),
+        NusantaraConfig(
+            name="id_qqp_nusantara_t2t",
+            version=NUSANTARA_VERSION,
+            description="ID QQP Nusantara schema",
+            schema="nusantara_t2t",
+            subset_id="id_qqp",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = "id_qqp_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        if self.config.schema == "source":
+            # Raw question pairs, as stored in the upstream JSON Lines files.
+            features = datasets.Features(
+                {
+                    "id": datasets.Value("string"),
+                    "question_1": datasets.Value("string"),
+                    "question_2": datasets.Value("string"),
+                }
+            )
+        elif self.config.schema == "nusantara_t2t":
+            # Shared Nusantara text-to-text schema (text_1/text_2 pairs).
+            features = schemas.text2text_features
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        urls = _URLS[_DATASETNAME]
+        data_dir = dl_manager.download_and_extract(urls)
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": data_dir[0],
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": data_dir[1],
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
+        # Each data file is JSON Lines: one {"question_1": ..., "question_2": ...} object per line.
+        with open(filepath, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+
+        if self.config.schema == "source":
+            for i, line in enumerate(lines):
+                row = json.loads(line.strip())
+                sample = {
+                    "id": str(i),
+                    "question_1": row["question_1"],
+                    "question_2": row["question_2"],
+                }
+                yield i, sample
+
+        elif self.config.schema == "nusantara_t2t":
+            for i, line in enumerate(lines):
+                row = json.loads(line.strip())
+                sample = {
+                    "id": str(i),
+                    "text_1": row["question_1"],
+                    "text_2": row["question_2"],
+                    "text_1_name": "question_1",
+                    "text_2_name": "question_2",
+                }
+                yield i, sample
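
Reviewer note, not part of the patch: a minimal smoke-test sketch for the loader, assuming it is run from the repository root (so the nusantara package is importable) and a datasets version that still supports loading from a local script path. The inline JSON sample is illustrative only; the real files are fetched from the _URLS above.

import datasets

# Expected upstream format (illustrative values), one JSON object per line:
#   {"question_1": "Bagaimana cara belajar Python?",
#    "question_2": "Apa cara terbaik untuk belajar Python?"}

# Source schema: raw "question_1"/"question_2" pairs.
source = datasets.load_dataset("nusantara/nusa_datasets/id_qqp/id_qqp.py", name="id_qqp_source")
print(source["train"][0])

# Nusantara T2T schema: the same pairs mapped to text_1/text_2 plus *_name fields.
t2t = datasets.load_dataset("nusantara/nusa_datasets/id_qqp/id_qqp.py", name="id_qqp_nusantara_t2t")
print(t2t["validation"][0])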