Merge branch 'master' of https://github.com/yana-xuyan/nusa-crowd
holylovenia committed Aug 10, 2022
2 parents 9f2cd00 + d779479 commit 50f62ed
Showing 1 changed file with 137 additions and 0 deletions.
137 changes: 137 additions & 0 deletions nusantara/nusa_datasets/id_qqp/id_qqp.py
@@ -0,0 +1,137 @@
import json
from pathlib import Path
from typing import Dict, List, Tuple

import datasets

from nusantara.utils import schemas
from nusantara.utils.configs import NusantaraConfig
from nusantara.utils.constants import Tasks

_CITATION = """\
https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs
"""

_DATASETNAME = "id_qqp"

_DESCRIPTION = """\
ID QQP is an Indonesian paraphrasing dataset adapted from the Quora Question Pairs
(QQP) dataset. Each example is a pair of Indonesian questions that are paraphrases
of each other.
"""

_HOMEPAGE = "https://github.com/louisowen6/quora_paraphrasing_id"

_LICENSE = "Apache License, Version 2.0"
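
# Each URL in _URLS points to a JSON Lines file: one JSON object per line with
# "question_1" and "question_2" keys holding an Indonesian paraphrase pair,
# e.g. {"question_1": "...", "question_2": "..."} (illustrative, not real data).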

_URLS = {
_DATASETNAME: [
"https://github.com/louisowen6/quora_paraphrasing_id/raw/main/ID_Quora_Paraphrasing_train.json",
"https://github.com/louisowen6/quora_paraphrasing_id/raw/main/ID_Quora_Paraphrasing_val.json",
]
}

_SUPPORTED_TASKS = [Tasks.PARAPHRASING]

_SOURCE_VERSION = "1.0.0"

_NUSANTARA_VERSION = "1.0.0"


class IdQQP(datasets.GeneratorBasedBuilder):
    """ID QQP is an Indonesian paraphrasing dataset adapted from the Quora Question Pairs dataset. Each example is a pair of Indonesian questions that paraphrase each other."""

SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION)

BUILDER_CONFIGS = [
NusantaraConfig(
name="id_qqp_source",
version=SOURCE_VERSION,
description="ID QQP source schema",
schema="source",
subset_id="id_qqp",
),
NusantaraConfig(
name="id_qqp_nusantara_t2t",
version=NUSANTARA_VERSION,
description="ID QQP Nusantara schema",
schema="nusantara_t2t",
subset_id="id_qqp",
),
]

DEFAULT_CONFIG_NAME = "id_qqp_source"

def _info(self) -> datasets.DatasetInfo:

if self.config.schema == "source":

features = datasets.Features(
{
"id": datasets.Value("string"),
"question_1": datasets.Value("string"),
"question_2": datasets.Value("string")
}
)

        elif self.config.schema == "nusantara_t2t":
            features = schemas.text2text_features

        else:
            raise ValueError(f"Invalid config schema: {self.config.schema}")

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
urls = _URLS[_DATASETNAME]
data_dir = dl_manager.download_and_extract(urls)
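        # download_and_extract returns the local paths in the same order as `urls`,
        # so data_dir[0] is the train file and data_dir[1] is the validation file.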
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": data_dir[0],
},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={
"filepath": data_dir[1],
},
),
]

def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:

        with open(filepath, "r", encoding="utf-8") as f:
lines = f.readlines()

if self.config.schema == "source":

for i, line in enumerate(lines):
line = json.loads(line.strip())

sample = {
"id": str(i),
"question_1": line["question_1"],
"question_2": line["question_2"]
}
yield i, sample

elif self.config.schema == "nusantara_t2t":

for i, line in enumerate(lines):
line = json.loads(line.strip())

sample = {
"id": str(i),
"text_1": line["question_1"],
"text_2": line["question_2"],
"text_1_name": "question_1",
"text_2_name": "question_2"
}
yield i, sample
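
For reference, here is a minimal usage sketch (not part of the commit). It assumes this script is saved locally as id_qqp.py and that the nusantara package is importable; newer versions of the datasets library may additionally require trust_remote_code=True.

import datasets

# Load the Nusantara text-to-text view defined by this script
# ("id_qqp.py" is a hypothetical local path to the file above).
dset = datasets.load_dataset("id_qqp.py", name="id_qqp_nusantara_t2t")

# Inspect one paraphrase pair from the training split.
sample = dset["train"][0]
print(sample["text_1"])  # an Indonesian question
print(sample["text_2"])  # its paraphrase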
