diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 5a87de32..450fcce8 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,3 +1,3 @@
 # These are the current maintainers/admin of the nusantara-datasets repo
-* @samuelcahyawijaya @afaji @holylovenia @gentaiscool @bryanwilie @fajri91
+* @samuelcahyawijaya @afaji @holylovenia @gentaiscool @bryanwilie @fajri91 @christianwbsn @muhsatrio
 
diff --git a/.github/workflows/update-readme.yml b/.github/workflows/update-readme.yml
new file mode 100644
index 00000000..d3718a89
--- /dev/null
+++ b/.github/workflows/update-readme.yml
@@ -0,0 +1,31 @@
+name: Update README
+on:
+  push:
+    branches:
+      - master
+  workflow_dispatch:
+  schedule:
+    - cron: '0 0 * * 0'
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Check out repo
+      uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.8
+        architecture: x64
+    - name: Update README
+      run: |-
+        python update_readme.py
+        cat README.md
+    - name: Commit and push if changed
+      run: |-
+        git diff
+        git config --global user.email "readme-bot@indonlp.com"
+        git config --global user.name "README-Bot"
+        git add -A
+        git commit -m "Updated progress bar" || exit 0
+        git push
diff --git a/README.id.md b/README.id.md
index 9dbff227..7dccf742 100644
--- a/README.id.md
+++ b/README.id.md
@@ -4,13 +4,15 @@
 
 ![Dataset claimed](https://progress-bar.dev/81/?title=Datasets%20Claimed%20(77%20Datasets%20Claimed))
 
+<!-- milestone starts -->
 ![Milestone 1](https://progress-bar.dev/100/?title=Milestone%201%20(30%20Datasets%20Completed))
 
-![Milestone 2](https://progress-bar.dev/80/?title=Milestone%202%20(60%20Datasets%20Completed))
+![Milestone 2](https://progress-bar.dev/95/?title=Milestone%202%20(60%20Datasets%20Completed))
 
-![Milestone 3](https://progress-bar.dev/48/?title=Milestone%203%20(100%20Datasets%20Completed))
+![Milestone 3](https://progress-bar.dev/57/?title=Milestone%203%20(100%20Datasets%20Completed))
 
-![Milestone 4](https://progress-bar.dev/32/?title=Milestone%204%20(150%20Datasets%20Completed))
+![Milestone 4](https://progress-bar.dev/38/?title=Milestone%204%20(150%20Datasets%20Completed))
+<!-- milestone ends -->
 
 *Read this README in [English](README.md).*
 
diff --git a/README.md b/README.md
index 8c1b859f..06cb27ff 100644
--- a/README.md
+++ b/README.md
@@ -4,13 +4,15 @@
 
 ![Dataset claimed](https://progress-bar.dev/80/?title=Datasets%20Claimed%20(77%20Datasets%20Claimed))
 
+<!-- milestone starts -->
 ![Milestone 1](https://progress-bar.dev/100/?title=Milestone%201%20(30%20Datasets%20Completed))
 
-![Milestone 2](https://progress-bar.dev/80/?title=Milestone%202%20(60%20Datasets%20Completed))
+![Milestone 2](https://progress-bar.dev/95/?title=Milestone%202%20(60%20Datasets%20Completed))
 
-![Milestone 3](https://progress-bar.dev/48/?title=Milestone%203%20(100%20Datasets%20Completed))
+![Milestone 3](https://progress-bar.dev/57/?title=Milestone%203%20(100%20Datasets%20Completed))
 
-![Milestone 4](https://progress-bar.dev/32/?title=Milestone%204%20(150%20Datasets%20Completed))
+![Milestone 4](https://progress-bar.dev/38/?title=Milestone%204%20(150%20Datasets%20Completed))
+<!-- milestone ends -->
 
 *Baca README ini dalam [Bahasa Indonesia](README.id.md).*
 
@@ -60,7 +62,7 @@ You can upload your dataset publicly first, eg. on Github.
 
 #### Can I create a PR if I have an idea?
 
-If you have an idea to improve or change the code of the nusa-crowd repository, please create an `issue` and ask for `feedback` before starting any PRs. 
+If you have an idea to improve or change the code of the nusa-crowd repository, please create an `issue` and ask for `feedback` before starting any PRs.
 
 #### I am confused, can you help me?
 
diff --git a/nusantara/nusa_datasets/id_qqp/id_qqp.py b/nusantara/nusa_datasets/id_qqp/id_qqp.py
index 87f292e0..496becdf 100644
--- a/nusantara/nusa_datasets/id_qqp/id_qqp.py
+++ b/nusantara/nusa_datasets/id_qqp/id_qqp.py
@@ -10,14 +10,22 @@ import json
 
 _CITATION = """\
-    https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs
+@misc{quoraFirstQuora,
+  author = {},
+  title = {{F}irst {Q}uora {D}ataset {R}elease: {Q}uestion {P}airs --- quoradata.quora.com},
+  howpublished = {https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs},
+  year = 2017,
+  note = {Online},
+}
 """
 
 _DATASETNAME = "id_qqp"
 
 _DESCRIPTION = """\
-INDOSUM is a new benchmark dataset for Indonesian text summarization.
-The dataset consists of news articles and manually constructed summaries.
+The Quora Question Pairs (QQP) dataset consists of over 400,000 question pairs,
+and each question pair is annotated with a binary value indicating whether
+the two questions are paraphrases of each other. This dataset is a translated
+version of QQP into Indonesian.
 """
 
 _HOMEPAGE = "https://github.com/louisowen6/quora_paraphrasing_id"
 
@@ -38,8 +46,13 @@ _NUSANTARA_VERSION = "1.0.0"
 
 
-class IndoSUM(datasets.GeneratorBasedBuilder):
-    """INDOSUM is a new benchmark dataset for Indonesian text summarization. The dataset consists of news articles and manually constructed summaries."""
+class IdQuoraQuestionPairs(datasets.GeneratorBasedBuilder):
+    """
+    The Quora Question Pairs (QQP) dataset consists of over 400,000 question pairs,
+    and each question pair is annotated with a binary value indicating whether
+    the two questions are paraphrases of each other. This dataset is a translated
+    version of QQP into Indonesian.
+    """
 
     SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
     NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION)
diff --git a/nusantara/nusa_datasets/id_stance/id_stance.py b/nusantara/nusa_datasets/id_stance/id_stance.py
new file mode 100644
index 00000000..7a248db6
--- /dev/null
+++ b/nusantara/nusa_datasets/id_stance/id_stance.py
@@ -0,0 +1,133 @@
+import json
+from pathlib import Path
+from typing import List
+
+import datasets
+import pandas as pd
+
+from nusantara.utils import schemas
+from nusantara.utils.configs import NusantaraConfig
+from nusantara.utils.constants import Tasks
+
+_CITATION = """\
+@INPROCEEDINGS{8629144,
+  author={R. {Jannati} and R. {Mahendra} and C. W. {Wardhana} and M. {Adriani}},
+  booktitle={2018 International Conference on Asian Language Processing (IALP)},
+  title={Stance Classification Towards Political Figures on Blog Writing},
+  year={2018},
+  volume={},
+  number={},
+  pages={96-101},
+}
+"""
+_DATASETNAME = "id_stance"
+_DESCRIPTION = """\
+Stance Classification Towards Political Figures on Blog Writing.
+This dataset comes from the second study, which combined the data of the first study with newly collected data.
+It consists of 337 examples covering five targets, and each target is paired with one event.
+Two labels are used: 'for' and 'against'.
+1. For - the text written by the author supports the target in the event
+2. Against - the text written by the author opposes the target in the event
+"""
+_HOMEPAGE = "https://github.com/reneje/id_stance_dataset_article-Stance-Classification-Towards-Political-Figures-on-Blog-Writing"
+_LICENSE = "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License"
+_URLs = {
+    _DATASETNAME: "https://raw.githubusercontent.com/reneje/id_stance_dataset_article-Stance-Classification-Towards-Political-Figures-on-Blog-Writing/master/dataset_stance_2_label_2018_building_by_rini.csv"
+}
+_SUPPORTED_TASKS = [Tasks.TEXTUAL_ENTAILMENT]
+_SOURCE_VERSION = "1.0.0"
+_NUSANTARA_VERSION = "1.0.0"
+
+
+def parse_list(content):
+    if (not content):
+        return []
+    try:
+        return json.loads(content)
+    except:
+        return json.loads("[\"" + content[1:-1].replace("\"", "\\\"") + "\"]")
+
+
+class IdStance(datasets.GeneratorBasedBuilder):
+    """The ID Stance dataset is annotated with a label whether the article is in favor of the person in the context of the event"""
+
+    BUILDER_CONFIGS = [
+        NusantaraConfig(
+            name="id_stance_source",
+            version=datasets.Version(_SOURCE_VERSION),
+            description="IdStance source schema",
+            schema="source",
+            subset_id="id_stance",
+        ),
+        NusantaraConfig(
+            name="id_stance_nusantara_pairs",
+            version=datasets.Version(_NUSANTARA_VERSION),
+            description="IdStance Nusantara schema",
+            schema="nusantara_pairs",
+            subset_id="id_stance",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = "id_stance_source"
+
+    def _info(self):
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "person": datasets.Value("string"),
+                    "event": datasets.Value("string"),
+                    "title": datasets.Value("string"),
+                    "content": datasets.Value("string"),
+                    "stance_final": datasets.Value("string"),
+                }
+            )
+        elif self.config.schema == "nusantara_pairs":
+            features = schemas.pairs_features(["for", "against", "againts", "no"])
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        data_path = Path(dl_manager.download_and_extract(_URLs[_DATASETNAME]))
+        data_files = {
+            "train": data_path,
+        }
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"filepath": data_files["train"]},
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path):
+        df = pd.read_csv(filepath, sep=";", header="infer", keep_default_na=False).reset_index()
+        df.columns = ["id", "person", "event", "title", "content", "stance_final", ""]
+        df.content = df.content.apply(parse_list)
+
+        if self.config.schema == "source":
+            for row in df.itertuples():
+                ex = {
+                    "person": row.person,
+                    "event": row.event,
+                    "title": row.title,
+                    "content": " ".join(row.content),
+                    "stance_final": row.stance_final
+                }
+                yield row.id, ex
+        elif self.config.schema == "nusantara_pairs":
+            for row in df.itertuples():
+                ex = {
+                    "id": row.id,
+                    "text_1": row.person + " | " + row.event,
+                    "text_2": " ".join([row.title] + row.content),
+                    "label": row.stance_final
+                }
+                yield row.id, ex
+        else:
+            raise ValueError(f"Invalid config: {self.config.name}")
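Reviewer note on the `parse_list` helper above: the raw CSV stores the article body either as a JSON list or as a bracket-wrapped plain string, so the helper falls back to wrapping the bracket-stripped text as a one-element list. A minimal, self-contained sketch of that behaviour (the sample strings are invented, and the bare `except` is narrowed for the sketch):

```python
import json

def parse_list(content):
    # Mirrors the fallback logic in id_stance.py.
    if not content:
        return []
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        return json.loads('["' + content[1:-1].replace('"', '\\"') + '"]')

print(parse_list('["kalimat pertama", "kalimat kedua"]'))         # two elements
print(parse_list('[teks biasa dengan "kutipan" di dalamnya]'))     # one element, quotes preserved
```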
diff --git a/nusantara/nusa_datasets/indo_puisi/indo_puisi.py b/nusantara/nusa_datasets/indo_puisi/indo_puisi.py
new file mode 100644
index 00000000..f6d16dc6
--- /dev/null
+++ b/nusantara/nusa_datasets/indo_puisi/indo_puisi.py
@@ -0,0 +1,114 @@
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+import pandas as pd
+
+from nusantara.utils import schemas
+from nusantara.utils.configs import NusantaraConfig
+from nusantara.utils.constants import (DEFAULT_NUSANTARA_VIEW_NAME,
+                                       DEFAULT_SOURCE_VIEW_NAME, Tasks)
+
+_DATASETNAME = "indo_puisi"
+_SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME
+_UNIFIED_VIEW_NAME = DEFAULT_NUSANTARA_VIEW_NAME
+
+_CITATION = """
+"""
+
+_DESCRIPTION = """\
+Puisi is an Indonesian poetic form. The dataset was collected by scraping various websites. It contains 7223 Indonesian puisi along with the title and author.
+"""
+
+_HOMEPAGE = "https://github.com/ilhamfp/puisi-pantun-generator"
+
+_LICENSE = "Creative Commons Attribution Share-Alike 4.0 International"
+
+_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING]
+
+_SOURCE_VERSION = "1.0.0"
+
+_NUSANTARA_VERSION = "1.0.0"
+
+_URLS = {
+    "train": "https://raw.githubusercontent.com/ilhamfp/puisi-pantun-generator/main/data/puisi.csv",
+}
+
+
+class IndoPuisi(datasets.GeneratorBasedBuilder):
+    """IndoPuisi contains 7223 Indonesian puisi along with the title and author."""
+
+    BUILDER_CONFIGS = (
+        NusantaraConfig(
+            name="indo_puisi_source",
+            version=_SOURCE_VERSION,
+            description="Indo puisi source schema",
+            schema="source",
+            subset_id="indo_puisi",
+        ),
+        NusantaraConfig(
+            name="indo_puisi_nusantara_ssp",
+            version=_NUSANTARA_VERSION,
+            description="Indo puisi Nusantara schema",
+            schema="nusantara_ssp",
+            subset_id="indo_puisi",
+        ),
+    )
+
+    DEFAULT_CONFIG_NAME = "indo_puisi_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "id": datasets.Value("string"),
+                    "puisi": datasets.Value("string"),
+                    "title": datasets.Value("string"),
+                    "author": datasets.Value("string"),
+                    "puisi_with_header": datasets.Value("string"),
+                }
+            )
+        elif self.config.schema == "nusantara_ssp":
+            features = schemas.self_supervised_pretraining.features
+        else:
+            raise ValueError(f"Invalid config schema: {self.config.schema}")
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        train_csv_path = Path(dl_manager.download(_URLS["train"]))
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"filepath": train_csv_path},
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
+        if self.config.schema != "source" and self.config.schema != "nusantara_ssp":
+            raise ValueError(f"Invalid config schema: {self.config.schema}")
+
+        df = pd.read_csv(filepath).reset_index()
+        if self.config.name == "indo_puisi_source":
+            for row in df.itertuples():
+                ex = {
+                    "id": str(row.index),
+                    "puisi": str(row.puisi).rstrip(),
+                    "title": row.title,
+                    "author": row.author,
+                    "puisi_with_header": str(row.puisi_with_header).rstrip(),
+                }
+                yield row.index, ex
+
+        elif self.config.name == "indo_puisi_nusantara_ssp":
+            for row in df.itertuples():
+                ex = {"id": str(row.index), "text": str(row.puisi).rstrip()}
+                yield row.index, ex
diff --git a/nusantara/nusa_datasets/pos_sun_mono/pos_sun_mono.py b/nusantara/nusa_datasets/pos_sun_mono/pos_sun_mono.py
new file mode 100644
index 00000000..175973dc
--- /dev/null
+++ b/nusantara/nusa_datasets/pos_sun_mono/pos_sun_mono.py
@@ -0,0 +1,268 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from nusantara.utils import schemas
+from nusantara.utils.configs import NusantaraConfig
+from nusantara.utils.constants import Tasks
+
+_CITATION = """\
+@data{FK2/VTAHRH_2022,
+  author = {ARDIYANTI SURYANI, ARIE and Widyantoro, Dwi Hendratmo and Purwarianti, Ayu and Sudaryat, Yayat},
+  publisher = {Telkom University Dataverse},
+  title = {{PoSTagged Sundanese Monolingual Corpus}},
+  year = {2022},
+  version = {DRAFT VERSION},
+  doi = {10.34820/FK2/VTAHRH},
+  url = {https://doi.org/10.34820/FK2/VTAHRH}
+}
+
+@INPROCEEDINGS{7437678,
+  author={Suryani, Arie Ardiyanti and Widyantoro, Dwi Hendratmo and Purwarianti, Ayu and Sudaryat, Yayat},
+  booktitle={2015 International Conference on Information Technology Systems and Innovation (ICITSI)},
+  title={Experiment on a phrase-based statistical machine translation using PoS Tag information for Sundanese into Indonesian},
+  year={2015},
+  volume={},
+  number={},
+  pages={1-6},
+  doi={10.1109/ICITSI.2015.7437678}
+}
+"""
+
+_DATASETNAME = "pos_sun_mono"
+
+_DESCRIPTION = """\
+This dataset contains 3616 lines of Sundanese sentences taken from several online magazines (Mangle, Dewan Dakwah Jabar, and Balebat), \
+annotated with PoS labels by several undergraduates of the Sundanese Language Education Study Program (PPBS), UPI Bandung.
+"""
+
+_HOMEPAGE = "https://dataverse.telkomuniversity.ac.id/dataset.xhtml?persistentId=doi:10.34820/FK2/VTAHRH"
+
+_LICENSE = 'CC0 - "Public Domain Dedication"'
+
+_URLS = {
+    _DATASETNAME: "https://dataverse.telkomuniversity.ac.id/api/access/datafile/:persistentId?persistentId=doi:10.34820/FK2/VTAHRH/WQIFK8",
+}
+
+_SUPPORTED_TASKS = [Tasks.POS_TAGGING]
+
+_SOURCE_VERSION = "1.1.0"
+
+_NUSANTARA_VERSION = "1.0.0"
+
+
+class PosSunMonoDataset(datasets.GeneratorBasedBuilder):
+    """PoSTagged Sundanese Monolingual Corpus"""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION)
+
+    # Based on Wicaksono, A. F., & Purwarianti, A. (2010). HMM Based Part-of-Speech Tagger for Bahasa Indonesia. In Proceedings of the 4th International MALINDO (Malay and Indonesian Language) Workshop.
+    POS_TAGS = [
+        "",
+        "!",
+        '"',
+        "'",
+        ")",
+        ",",
+        "-",
+        ".",
+        "...",
+        "....",
+        "/",
+        ":",
+        ";",
+        "?",
+        "C",
+        "CBI",
+        "CC",
+        "CDC",
+        "CDI",
+        "CDO",
+        "CDP",
+        "CDT",
+        "CP",
+        "CRB",
+        "CS",
+        "DC",
+        "DT",
+        "FE",
+        "FW",
+        "GM",
+        "IN",
+        "J",
+        "JJ",
+        "KA",
+        "KK",
+        "MD",
+        "MG",
+        "MN",
+        "N",
+        "NEG",
+        "NN",
+        "NNA",
+        "NNG",
+        "NNN",
+        "NNO",
+        "NNP",
+        "NNPP",
+        "NP",
+        "NPP",
+        "OP",
+        "PB",
+        "PCDP",
+        "PR",
+        "PRL",
+        "PRL|IN",
+        "PRN",
+        "PRP",
+        "RB",
+        "RBT",
+        "RB|RP",
+        "RN",
+        "RP",
+        "SC",
+        "SCC",
+        "SC|IN",
+        "SYM",
+        "UH",
+        "VB",
+        "VBI",
+        "VBT",
+        "VRB",
+        "W",
+        "WH",
+        "WHP",
+        "WRP",
+        "`",
+        "–",
+        "—",
+        "‘",
+        "’",
+        "“",
+        "”",
+    ]
+
+    BUILDER_CONFIGS = [
+        NusantaraConfig(
+            name=f"{_DATASETNAME}_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=f"{_DATASETNAME}",
+        ),
+        NusantaraConfig(
+            name=f"{_DATASETNAME}_nusantara_seq_label",
+            version=NUSANTARA_VERSION,
+            description=f"{_DATASETNAME} Nusantara Seq Label schema",
+            schema="nusantara_seq_label",
+            subset_id=f"{_DATASETNAME}",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+
+        if self.config.schema == "source":
+            features = datasets.Features({"labeled_sentence": datasets.Value("string")})
+        elif self.config.schema == "nusantara_seq_label":
+            features = schemas.seq_label_features(self.POS_TAGS)
+
+        else:
+            raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.")
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        urls = _URLS[_DATASETNAME]
+        data_path = dl_manager.download(urls)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": data_path,
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+
+        def __hotfix(line):
+            if line.endswith(" taun|NN 1953.|."):
+                return line.replace(" taun|NN 1953.|.", " taun|NN 1953|CDP .|.")
+            elif line.endswith(" jeung|CC|CC sasab|RB .|."):
+                return line.replace(" jeung|CC|CC sasab|RB .|.", " jeung|CC sasab|RB .|.")
+            elif line.startswith("Kagiatan|NN éta|DT dihadiran|VBT kira|-kira "):
+                return line.replace("Kagiatan|NN éta|DT dihadiran|VBT kira|-kira ", "Kagiatan|NN éta|DT dihadiran|VBT kira-kira|DT ")
+            return line
+
+        with open(filepath, "r", encoding="utf8") as ipt:
+            raw = list(map(lambda l: __hotfix(l.rstrip("\n ")), ipt))
+
+        pat_0 = r"(,\|,|\?\|\?|-\|-|!\|!)"
+        repl_spc = r" \1 "
+
+        pat_1 = r"([A-Z”])(\.\|\.)"
+        pat_2 = r"(\.\|\.)([^. ])"
+        repl_spl = r"\1 \2"
+
+        pat_3 = r"([^ ]+\|[^ ]+)\| "
+        repl_del = r"\1 "
+
+        pat_4 = r"\|\|"
+        repl_dup = r"|"
+
+        def __apply_regex(txt):
+            for pat, repl in [(pat_0, repl_spc), (pat_1, repl_spl), (pat_2, repl_spl), (pat_3, repl_del), (pat_4, repl_dup)]:
+                txt = re.sub(pat, repl, txt)
+            return txt
+
+        def __cleanse_label(token):
+            text, label = token
+            return text, re.sub(r"([A-Z]+)[.,)]", r"\1", label.upper())
+
+        if self.config.schema == "source":
+            for key, example in enumerate(raw):
+                yield key, {"labeled_sentence": example}
+
+        elif self.config.schema == "nusantara_seq_label":
+            spaced = list(map(__apply_regex, raw))
+            data = list(map(lambda l: [__cleanse_label(tok.split("|", 1)) for tok in filter(None, l.split(" "))], spaced))
+
+            for key, example in enumerate(data):
+                tokens, labels = zip(*example)
+                yield key, {"id": str(key), "tokens": tokens, "labels": labels}
+
+        else:
+            raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.")
+
+
+if __name__ == "__main__":
+    datasets.load_dataset(__file__)
diff --git a/nusantara/nusa_datasets/ud_id_csui/__init__.py b/nusantara/nusa_datasets/ud_id_csui/__init__.py
new file mode 100644
index 00000000..e69de29b
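Reviewer note on the `pos_sun_mono` loader above: after the regex clean-up, each corpus line is just space-separated `word|TAG` tokens that are split once on `|`. A minimal sketch of that final step, using a fragment taken from the `__hotfix` rules (the trailing period token is added for illustration):

```python
line = "Kagiatan|NN éta|DT dihadiran|VBT kira-kira|DT .|."

pairs = [tok.split("|", 1) for tok in line.split(" ") if tok]
tokens, labels = zip(*pairs)
print(tokens)  # ('Kagiatan', 'éta', 'dihadiran', 'kira-kira', '.')
print(labels)  # ('NN', 'DT', 'VBT', 'DT', '.')
```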
diff --git a/nusantara/nusa_datasets/ud_id_csui/ud_id_csui.py b/nusantara/nusa_datasets/ud_id_csui/ud_id_csui.py
new file mode 100644
index 00000000..5a9281c0
--- /dev/null
+++ b/nusantara/nusa_datasets/ud_id_csui/ud_id_csui.py
@@ -0,0 +1,239 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+from conllu import TokenList
+
+from nusantara.utils import schemas
+from nusantara.utils.common_parser import load_ud_data, load_ud_data_as_nusantara_kb
+from nusantara.utils.configs import NusantaraConfig
+from nusantara.utils.constants import Tasks
+
+_CITATION = """\
+@article {10.3844/jcssp.2020.1585.1597,
+author = {Alfina, Ika and Budi, Indra and Suhartanto, Heru},
+title = {Tree Rotations for Dependency Trees: Converting the Head-Directionality of Noun Phrases},
+article_type = {journal},
+volume = {16},
+number = {11},
+year = {2020},
+month = {Nov},
+pages = {1585-1597},
+doi = {10.3844/jcssp.2020.1585.1597},
+url = {https://thescipub.com/abstract/jcssp.2020.1585.1597},
+journal = {Journal of Computer Science},
+publisher = {Science Publications}
+}
+"""
+
+_DATASETNAME = "ud_id_csui"
+
+_DESCRIPTION = """\
+UD Indonesian-CSUI is a conversion from an Indonesian constituency treebank in the Penn Treebank format named Kethu, which was itself a conversion from the constituency treebank built by Dinakaramani et al. (2015).
+This treebank is named after the place where the treebank was built: Faculty of Computer Science (CS), Universitas Indonesia (UI).
+
+About this treebank:
+- Genre is news in formal Indonesian (the majority is economic news)
+- 1030 sentences (28K words) divided into testing and training sets of around 10K words and around 18K words respectively.
+- Average of 27.4 words per sentence.
+"""
+
+_HOMEPAGE = "https://github.com/UniversalDependencies/UD_Indonesian-CSUI"
+
+_LICENSE = "CC BY-SA 4.0"
+
+_URLS = {
+    _DATASETNAME: {
+        "train": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-CSUI/master/id_csui-ud-train.conllu",
+        "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-CSUI/master/id_csui-ud-test.conllu",
+    },
+}
+
+_SUPPORTED_TASKS = [Tasks.DEPENDENCY_PARSING, Tasks.MACHINE_TRANSLATION, Tasks.POS_TAGGING]
+
+_SOURCE_VERSION = "1.0.0"
+
+_NUSANTARA_VERSION = "1.0.0"
+
+
+class UdIdCsuiDataset(datasets.GeneratorBasedBuilder):
+    """Treebank of formal Indonesian news which consists of 1030 sentences (28K words)"""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION)
+
+    # source: https://universaldependencies.org/u/pos/
+    UPOS_TAGS = ["ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"]
+
+    BUILDER_CONFIGS = [
+        NusantaraConfig(
+            name=f"{_DATASETNAME}_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=f"{_DATASETNAME}",
+        ),
+        NusantaraConfig(
+            name=f"{_DATASETNAME}_nusantara_kb",
+            version=NUSANTARA_VERSION,
+            description=f"{_DATASETNAME} Nusantara KB schema",
+            schema="nusantara_kb",
+            subset_id=f"{_DATASETNAME}",
+        ),
+        NusantaraConfig(
+            name=f"{_DATASETNAME}_nusantara_t2t",
+            version=NUSANTARA_VERSION,
+            description=f"{_DATASETNAME} Nusantara Text to Text schema",
+            schema="nusantara_t2t",
+            subset_id=f"{_DATASETNAME}",
+        ),
+        NusantaraConfig(
+            name=f"{_DATASETNAME}_nusantara_seq_label",
+            version=NUSANTARA_VERSION,
+            description=f"{_DATASETNAME} Nusantara Seq Label schema",
+            schema="nusantara_seq_label",
+            subset_id=f"{_DATASETNAME}",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    # metadata
+                    "sent_id": datasets.Value("string"),
+                    "text": datasets.Value("string"),
+                    "text_en": datasets.Value("string"),
+                    # tokens
+                    "id": [datasets.Value("string")],
+                    "form": [datasets.Value("string")],
+                    "lemma": [datasets.Value("string")],
+                    "upos": [datasets.Value("string")],
+                    "xpos": [datasets.Value("string")],
+                    "feats": [datasets.Value("string")],
+                    "head": [datasets.Value("string")],
+                    "deprel": [datasets.Value("string")],
+                    "deps": [datasets.Value("string")],
+                    "misc": [datasets.Value("string")],
+                }
+            )
+
+        elif self.config.schema == "nusantara_kb":
+            features = schemas.kb_features
+
+        elif self.config.schema == "nusantara_t2t":
+            features = schemas.text2text_features
+
+        elif self.config.schema == "nusantara_seq_label":
+            features = schemas.seq_label_features(self.UPOS_TAGS)
+
+        else:
+            raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.")
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        urls = _URLS[_DATASETNAME]
+        data_path = dl_manager.download(urls)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": data_path["train"],
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": data_path["test"],
+                },
+            ),
+        ]
+
+    @staticmethod
+    def _assert_multispan_range_is_one(token_list: TokenList):
+        """
+        Asserting that all tokens with multiple span can only have 2 span, and \
+        no field other than form has important information
+        """
+        for token in token_list.filter(id=lambda i: not isinstance(i, int)):
+            _id = token["id"]
+            assert len(_id) == 3, f"Unexpected length of non-int CONLLU Token's id. Expected 3, found {len(_id)};"
+            assert all(isinstance(a, b) for a, b in zip(_id, [int, str, int])), f"Non-int ID should be in format of '\\d+-\\d+'. Found {_id};"
+            assert _id[2] - _id[0] == 1, f"Token has more than 2 spans. Found {_id[2] - _id[0] + 1} spans;"
+            for key in ["lemma", "upos", "xpos", "feats", "head", "deprel", "deps"]:
+                assert token[key] in {"_", None}, f"Field other than 'form' should not contain extra information. Found: '{key}' = '{token[key]}'"
+
+    def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+        # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
+
+        dataset = list(load_ud_data(filepath, filter_kwargs={"id": lambda i: isinstance(i, int)}, assert_fn=self._assert_multispan_range_is_one))
+
+        if self.config.schema == "source":
+            pass
+
+        elif self.config.schema == "nusantara_kb":
+            dataset = load_ud_data_as_nusantara_kb(filepath, dataset)
+
+        elif self.config.schema == "nusantara_t2t":
+            dataset = list(
+                map(
+                    lambda d: {
+                        "id": d["sent_id"],
+                        "text_1": d["text"],
+                        "text_2": d["text_en"],
+                        "text_1_name": "ind",
+                        "text_2_name": "eng",
+                    },
+                    dataset,
+                )
+            )
+
+        elif self.config.schema == "nusantara_seq_label":
+            dataset = list(
+                map(
+                    lambda d: {
+                        "id": d["sent_id"],
+                        "tokens": d["form"],
+                        "labels": d["upos"],
+                    },
+                    dataset,
+                )
+            )
+
+        else:
+            raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.")
+
+        for key, example in enumerate(dataset):
+            yield key, example
+
+
+if __name__ == "__main__":
+    datasets.load_dataset(__file__)
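Reviewer note on the `ud_id_csui` loader above: each parsed CoNLL-U sentence already carries the `text`/`text_en` metadata and the per-token `form`/`upos` lists, so the Nusantara configs are plain re-mappings of those fields. A minimal sketch with an invented sentence:

```python
# One sentence as returned by load_ud_data (values invented for illustration).
sent = {
    "sent_id": "dev-1",
    "text": "Dia membaca buku .",
    "text_en": "He reads a book .",
    "form": ["Dia", "membaca", "buku", "."],
    "upos": ["PRON", "VERB", "NOUN", "PUNCT"],
}

# nusantara_t2t: Indonesian/English sentence pair.
t2t = {"id": sent["sent_id"], "text_1": sent["text"], "text_2": sent["text_en"],
       "text_1_name": "ind", "text_2_name": "eng"}

# nusantara_seq_label: tokens with their UPOS tags.
seq_label = {"id": sent["sent_id"], "tokens": sent["form"], "labels": sent["upos"]}
print(t2t, seq_label, sep="\n")
```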
diff --git a/nusantara/nusa_datasets/xpersona_id/xpersona_id.py b/nusantara/nusa_datasets/xpersona_id/xpersona_id.py
new file mode 100644
index 00000000..c99d8abc
--- /dev/null
+++ b/nusantara/nusa_datasets/xpersona_id/xpersona_id.py
@@ -0,0 +1,189 @@
+import os
+from pathlib import Path
+from typing import Dict, List, Tuple
+from nusantara.utils.constants import Tasks
+from nusantara.utils import schemas
+
+import datasets
+import json
+
+from nusantara.utils.configs import NusantaraConfig
+
+_CITATION = """\
+@article{lin2020xpersona,
+  title={XPersona: Evaluating multilingual personalized chatbot},
+  author={Lin, Zhaojiang and Liu, Zihan and Winata, Genta Indra and Cahyawijaya, Samuel and Madotto, Andrea and Bang, Yejin and Ishii, Etsuko and Fung, Pascale},
+  journal={arXiv preprint arXiv:2003.07568},
+  year={2020}
+}
+@inproceedings{cahyawijaya-etal-2021-indonlg,
+    title = "{I}ndo{NLG}: Benchmark and Resources for Evaluating {I}ndonesian Natural Language Generation",
+    author = "Cahyawijaya, Samuel and
+      Winata, Genta Indra and
+      Wilie, Bryan and
+      Vincentio, Karissa and
+      Li, Xiaohong and
+      Kuncoro, Adhiguna and
+      Ruder, Sebastian and
+      Lim, Zhi Yuan and
+      Bahar, Syafri and
+      Khodra, Masayu and
+      Purwarianti, Ayu and
+      Fung, Pascale",
+    booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
+    month = nov,
+    year = "2021",
+    address = "Online and Punta Cana, Dominican Republic",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2021.emnlp-main.699",
+    doi = "10.18653/v1/2021.emnlp-main.699",
+    pages = "8875--8898"
+}
+"""
+
+_DATASETNAME = "xpersona_id"
+
+_DESCRIPTION = """\
+XPersona is a multi-lingual extension of Persona-Chat.
+The XPersona dataset includes persona conversations in six languages other than English for building and evaluating multilingual personalized agents.
+"""
+
+_HOMEPAGE = ""
+
+_LICENSE = "CC-BY-SA 4.0"
+
+_URLS = {
+    _DATASETNAME: "https://storage.googleapis.com/babert-pretraining/IndoNLG_finals/downstream_task/downstream_task_datasets.zip",
+}
+
+_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]
+
+_SOURCE_VERSION = "1.0.0"
+
+_NUSANTARA_VERSION = "1.0.0"
+
+
+class XPersonaID(datasets.GeneratorBasedBuilder):
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION)
+
+    BUILDER_CONFIGS = [
+        NusantaraConfig(
+            name="xpersona_id_source",
+            version=SOURCE_VERSION,
+            description="XPersona ID source schema",
+            schema="source",
+            subset_id="xpersona_id",
+        ),
+        NusantaraConfig(
+            name="xpersona_id_nusantara_t2t",
+            version=NUSANTARA_VERSION,
+            description="XPersona ID Nusantara schema",
+            schema="nusantara_t2t",
+            subset_id="xpersona_id",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = "xpersona_id_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "persona": datasets.Sequence(
+                        datasets.Value("string")
+                    ),
+                    "dialogue": datasets.Sequence(
+                        datasets.Sequence(
+                            datasets.Value("string")
+                        )
+                    )
+                }
+            )
+
+        elif self.config.schema == "nusantara_t2t":
+            features = schemas.text2text_features
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        urls = _URLS[_DATASETNAME]
+        data_dir = dl_manager.download_and_extract(urls)
+
+        data_dir = os.path.join(data_dir, "IndoNLG_downstream_tasks/xpersona")
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                # Whatever you put in gen_kwargs will be passed to _generate_examples
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "Id_persona_train_corrected.json"),
+                    "split": "train",
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "Id_persona_split_test_human_annotated.json"),
+                    "split": "test",
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "Id_persona_split_valid_human_annotated.json"),
+                    "split": "dev",
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
+        data = json.load(open(filepath, "r"))
+
+        if self.config.schema == "source":
+            key = 0
+            for each_data in data:
+                example = {
+                    "persona": each_data["persona"],
+                    "dialogue": each_data["dialogue"]
+                }
+                yield key, example
+                key+=1
+
+        elif self.config.schema == "nusantara_t2t":
+            id = 0
+            key = 0
+            for each_data in data:
+                persona = " | ".join(each_data["persona"])
+                for i in range(len(each_data["dialogue"]) - 1):
+                    example = {
+                        "text_1_name": persona,
+                        "text_2_name": "response"
+                    }
+
+                    # for first turn
+
+                    if i == 0:
+                        example["id"] = "{}_{}".format(id, i)
+                        example["text_1"] = "U: {}".format(each_data["dialogue"][i][0])
+                        example["text_2"] = each_data["dialogue"][i][1]
+                        yield key, example
+                        key+=1
+
+                    # for second turn and other until last turn
+
+                    example["id"] = "{}_{}".format(id, i+1)
+                    example["text_1"] = "U: {} | S: {} | U: {}".format(each_data["dialogue"][i][0], each_data["dialogue"][i][1], each_data["dialogue"][i+1][0])
+                    example["text_2"] = each_data["dialogue"][i+1][1]
+                    yield key, example
+                    key+=1
+                id+=1
+
+
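Reviewer note on the `xpersona_id` t2t generator above: a dialogue of N user/system turns yields N (context, response) pairs, with the running context rendered as `U: ... | S: ... | U: ...`. A minimal sketch of the same loop over an invented two-turn dialogue:

```python
# Each inner list is one [user_utterance, system_reply] turn (invented example).
dialogue = [["halo, apa kabar?", "baik, kamu sendiri?"],
            ["aku juga baik", "senang mendengarnya"]]

pairs = [("U: {}".format(dialogue[0][0]), dialogue[0][1])]
for i in range(len(dialogue) - 1):
    context = "U: {} | S: {} | U: {}".format(dialogue[i][0], dialogue[i][1], dialogue[i + 1][0])
    pairs.append((context, dialogue[i + 1][1]))

for text_1, text_2 in pairs:
    print(text_1, "->", text_2)
```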
diff --git a/nusantara/utils/common_parser.py b/nusantara/utils/common_parser.py
index 0581bf2b..85e92e54 100644
--- a/nusantara/utils/common_parser.py
+++ b/nusantara/utils/common_parser.py
@@ -1,3 +1,5 @@
+from typing import Iterable
+
 import pandas as pd
 from conllu import parse
 
@@ -21,29 +23,38 @@ def load_conll_data(file_path):
     return dataset
 
 
-def load_ud_data(filepath):
+def load_ud_data(filepath, filter_kwargs=None, assert_fn=None):
     """
     Load and parse conllu data.
 
     Proposed by @fhudi for issue #34 and #9.
 
     :param filepath: file path
+    :param filter_kwargs: filtering tokens, see conllu.models.TokenList.filter()
+    :param assert_fn: assertion to make sure raw data is in the expected format
     :return: generator with schema following CONLLU
     """
     dataset_raw = parse(open(filepath).read())
-    return map(lambda sent: {**sent.metadata, **pd.DataFrame(sent).to_dict(orient="list")}, dataset_raw)
+
+    filter_kwargs = filter_kwargs or dict()
+    if callable(assert_fn):
+        for token_list in dataset_raw:
+            assert_fn(token_list)
+
+    return map(lambda sent: {**sent.metadata, **pd.DataFrame(sent.filter(**filter_kwargs)).to_dict(orient="list")}, dataset_raw)
 
 
-def load_ud_data_as_nusantara_kb(filepath):
+def load_ud_data_as_nusantara_kb(filepath, dataset_source: Iterable = tuple()):
     """
     Load and parse conllu data, followed by mapping its elements to Nusantara Knowledge Base schema.
 
     Proposed by @fhudi for issue #34 and #9.
 
     :param filepath: file path
+    :param dataset_source: dataset with source schema (output of load_ud_data())
     :return: generator for Nusantara KB schema
     """
-    dataset_source = list(load_ud_data(filepath))
+    dataset_source = dataset_source or list(load_ud_data(filepath))
 
     def as_nusa_kb(tokens):
         sent_id = tokens["sent_id"]
diff --git a/nusantara/utils/constants.py b/nusantara/utils/constants.py
index d29f26b1..b1f532be 100644
--- a/nusantara/utils/constants.py
+++ b/nusantara/utils/constants.py
@@ -48,6 +48,11 @@ class Tasks(Enum):
     # Speech Recognition
     SPEECH_RECOGNITION = "ASR"
 
+    # ImageText
+    IMAGE_CAPTIONING = "IC"
+    STYLIZED_IMAGE_CAPTIONING = "SIC"
+    VISUALLY_GROUNDED_REASONING = "VGR"
+
 
 # TASK_TO_SCHEMA = {
 #     Tasks.NAMED_ENTITY_RECOGNITION: "KB",
diff --git a/nusantara/utils/schemas/__init__.py b/nusantara/utils/schemas/__init__.py
index 1671351e..67af2e0e 100644
--- a/nusantara/utils/schemas/__init__.py
+++ b/nusantara/utils/schemas/__init__.py
@@ -8,5 +8,6 @@
 from .seq_label import features as seq_label_features
 from .self_supervised_pretraining import features as ssp_features
 from .speech_recognition import features as asr_features
+from .image_text import features as image_text_features
 
-__all__ = ["kb_features", "qa_features", "text2text_features", "text_features", "text_multi_features", "pairs_features", "pairs_features_score", "seq_label_features", "ssp_features", "asr_features"]
+__all__ = ["kb_features", "qa_features", "text2text_features", "text_features", "text_multi_features", "pairs_features", "pairs_features_score", "seq_label_features", "ssp_features", "asr_features", "image_text_features"]
diff --git a/nusantara/utils/schemas/image_text.py b/nusantara/utils/schemas/image_text.py
new file mode 100644
index 00000000..59ef6d87
--- /dev/null
+++ b/nusantara/utils/schemas/image_text.py
@@ -0,0 +1,18 @@
+"""
+General ImageText Classification Schema
+"""
+import datasets
+
+
+def features(label_names = ["Yes", "No"]):
+    return datasets.Features(
+        {
+            "id": datasets.Value("string"),
+            "image_paths": datasets.Sequence(datasets.Value("string")),
+            "texts": datasets.Value("string"),
+            "metadata": {
+                "context": datasets.Value("string"),
+                "labels": datasets.Sequence(datasets.ClassLabel(names=label_names)),
+            }
+        }
+    )
+
diff --git a/tests/test_nusantara.py b/tests/test_nusantara.py
index a36bcad3..c484b5c6 100644
--- a/tests/test_nusantara.py
+++ b/tests/test_nusantara.py
@@ -12,7 +12,7 @@ import datasets
 from datasets import DatasetDict, Features
 
 from nusantara.utils.constants import Tasks
-from nusantara.utils.schemas import kb_features, pairs_features, pairs_features_score, qa_features, text2text_features, text_features, text_multi_features, seq_label_features, ssp_features, asr_features
+from nusantara.utils.schemas import kb_features, pairs_features, pairs_features_score, qa_features, text2text_features, text_features, text_multi_features, seq_label_features, ssp_features, asr_features, image_text_features
 
 sys.path.append(str(Path(__file__).parent.parent))
 
@@ -43,6 +43,9 @@
     Tasks.EMOTION_CLASSIFICATION: "TEXT",
     Tasks.SELF_SUPERVISED_PRETRAINING: "SSP",
     Tasks.SPEECH_RECOGNITION: "ASR",
+    Tasks.IMAGE_CAPTIONING: "IC",
+    Tasks.STYLIZED_IMAGE_CAPTIONING: "SIC",
+    Tasks.VISUALLY_GROUNDED_REASONING: "VGR",
 }
 
 _VALID_TASKS = set(_TASK_TO_SCHEMA.keys())
@@ -59,6 +62,9 @@
     "SEQ_LABEL": seq_label_features(),
     "SSP": ssp_features,
     "ASR": asr_features,
+    "IC": image_text_features(),
+    "SIC": image_text_features(),
+    "VGR": image_text_features(),
 }
 
 _TASK_TO_FEATURES = {
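Reviewer note on the new `image_text` schema exercised by the test changes above: a record is an id, one or more image paths, a text, and labelled metadata. A minimal sketch of a conforming example (file name and caption are placeholders):

```python
from nusantara.utils.schemas import image_text_features

features = image_text_features(label_names=["Yes", "No"])
example = {
    "id": "0",
    "image_paths": ["images/0001.jpg"],                # placeholder path
    "texts": "Seekor kucing tidur di atas sofa.",      # placeholder caption
    "metadata": {"context": "", "labels": ["Yes"]},
}
# features.encode_example(example) would validate the record against the schema.
```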
diff --git a/update_readme.py b/update_readme.py
new file mode 100644
index 00000000..05f2d265
--- /dev/null
+++ b/update_readme.py
@@ -0,0 +1,57 @@
+import os
+import pathlib
+import re
+
+root = pathlib.Path(__file__).parent.resolve()
+
+
+def replace_writing(content, marker, chunk, inline=False):
+    r = re.compile(
+        r"<!\-\- {} starts \-\->.*<!\-\- {} ends \-\->".format(marker, marker),
+        re.DOTALL,
+    )
+    if not inline:
+        chunk = "\n{}\n".format(chunk)
+    chunk = "<!-- {} starts -->{}<!-- {} ends -->".format(marker, chunk, marker)
+    return r.sub(chunk, content)
+
+
+def build_progress_bar(milestones={}):
+    progress_bar = []
+    progress_bar_template = "![Milestone {}](https://progress-bar.dev/{}/?title=Milestone%20{}%20({}%20Datasets%20Completed))"
+    for index, target in enumerate(milestones):
+        percentage = min(milestones[target] * 100 // target, 100)
+        bar = progress_bar_template.format(index + 1, percentage, index + 1, target)
+        progress_bar.append(bar)
+    return progress_bar
+
+
+def calculate_completed_dataset(base_dir="nusantara/nusa_datasets"):
+    count = 0
+    for path in os.listdir(base_dir):
+        if os.path.isdir(os.path.join(base_dir, path)) and path != "__pycache__":
+            count += 1
+    return count
+
+
+if __name__ == "__main__":
+    # read current readme
+    readme_path = root / "README.md"
+    readme = readme_path.open().read()
+
+    readme_id_path = root / "README.id.md"
+    readme_id = readme_id_path.open().read()
+
+    # calculate progress
+    target = [30, 60, 100, 150]
+    count_completed_dataset = calculate_completed_dataset()
+    milestones = {k: count_completed_dataset for k in target}
+    progress_bar = build_progress_bar(milestones=milestones)
+    entries_md = "\n\n".join(progress_bar)
+
+    # Update entries
+    rewritten_entries = replace_writing(readme, "milestone", entries_md)
+    readme_path.open("w").write(rewritten_entries)
+
+    rewritten_id_entries = replace_writing(readme_id, "milestone", entries_md)
+    readme_id_path.open("w").write(rewritten_id_entries)
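Reviewer note on `update_readme.py` above: the badge percentages are integer floor divisions capped at 100, which matches the badge values changed in the README diffs at the top of this patch (assuming 57 dataset folders are currently completed):

```python
targets = [30, 60, 100, 150]
completed = 57  # assumed current count of completed dataset folders

for index, target in enumerate(targets):
    percentage = min(completed * 100 // target, 100)
    print(f"Milestone {index + 1}: {percentage}%")
# Milestone 1: 100%, Milestone 2: 95%, Milestone 3: 57%, Milestone 4: 38%
```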