Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/IndoNLP/nusa-crowd
Browse files Browse the repository at this point in the history
  • Loading branch information
holylovenia committed Aug 19, 2022
2 parents 50f62ed + 06e13e6 commit eed2b41
Show file tree
Hide file tree
Showing 17 changed files with 1,108 additions and 19 deletions.
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# These are the current maintainers/admin of the nusantara-datasets repo

* @samuelcahyawijaya @afaji @holylovenia @gentaiscool @bryanwilie @fajri91
* @samuelcahyawijaya @afaji @holylovenia @gentaiscool @bryanwilie @fajri91 @christianwbsn @muhsatrio
31 changes: 31 additions & 0 deletions .github/workflows/update-readme.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
name: Update README
on:
push:
branches:
- master
workflow_dispatch:
schedule:
- cron: '0 0 * * 0'
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.8
architecture: x64
- name: Update README
run: |-
python update_readme.py
cat README.md
- name: Commit and push if changed
run: |-
git diff
git config --global user.email "[email protected]"
git config --global user.name "README-Bot"
git add -A
git commit -m "Updated progress bar" || exit 0
git push
8 changes: 5 additions & 3 deletions README.id.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@

![Dataset claimed](https://progress-bar.dev/81/?title=Datasets%20Claimed%20(77%20Datasets%20Claimed))

<!-- milestone starts -->
![Milestone 1](https://progress-bar.dev/100/?title=Milestone%201%20(30%20Datasets%20Completed))

![Milestone 2](https://progress-bar.dev/80/?title=Milestone%202%20(60%20Datasets%20Completed))
![Milestone 2](https://progress-bar.dev/95/?title=Milestone%202%20(60%20Datasets%20Completed))

![Milestone 3](https://progress-bar.dev/48/?title=Milestone%203%20(100%20Datasets%20Completed))
![Milestone 3](https://progress-bar.dev/57/?title=Milestone%203%20(100%20Datasets%20Completed))

![Milestone 4](https://progress-bar.dev/32/?title=Milestone%204%20(150%20Datasets%20Completed))
![Milestone 4](https://progress-bar.dev/38/?title=Milestone%204%20(150%20Datasets%20Completed))
<!-- milestone ends -->

*Read this README in [English](README.md).*

Expand Down
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@

![Dataset claimed](https://progress-bar.dev/80/?title=Datasets%20Claimed%20(77%20Datasets%20Claimed))

<!-- milestone starts -->
![Milestone 1](https://progress-bar.dev/100/?title=Milestone%201%20(30%20Datasets%20Completed))

![Milestone 2](https://progress-bar.dev/80/?title=Milestone%202%20(60%20Datasets%20Completed))
![Milestone 2](https://progress-bar.dev/95/?title=Milestone%202%20(60%20Datasets%20Completed))

![Milestone 3](https://progress-bar.dev/48/?title=Milestone%203%20(100%20Datasets%20Completed))
![Milestone 3](https://progress-bar.dev/57/?title=Milestone%203%20(100%20Datasets%20Completed))

![Milestone 4](https://progress-bar.dev/32/?title=Milestone%204%20(150%20Datasets%20Completed))
![Milestone 4](https://progress-bar.dev/38/?title=Milestone%204%20(150%20Datasets%20Completed))
<!-- milestone ends -->

*Baca README ini dalam [Bahasa Indonesia](README.id.md).*

Expand Down Expand Up @@ -60,7 +62,7 @@ You can upload your dataset publicly first, eg. on Github.

#### Can I create a PR if I have an idea?

If you have an idea to improve or change the code of the nusa-crowd repository, please create an `issue` and ask for `feedback` before starting any PRs.
If you have an idea to improve or change the code of the nusa-crowd repository, please create an `issue` and ask for `feedback` before starting any PRs.

#### I am confused, can you help me?

Expand Down
23 changes: 18 additions & 5 deletions nusantara/nusa_datasets/id_qqp/id_qqp.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,22 @@
import json

_CITATION = """\
https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs
@misc{quoraFirstQuora,
author = {},
title = {{F}irst {Q}uora {D}ataset {R}elease: {Q}uestion {P}airs --- quoradata.quora.com},
howpublished = {https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs},
year = 2017,
note = {Online},
}
"""

_DATASETNAME = "id_qqp"

_DESCRIPTION = """\
INDOSUM is a new benchmark dataset for Indonesian text summarization.
The dataset consists of news articles and manually constructed summaries.
Quora Question Pairs (QQP) dataset consists of over 400,000 question pairs,
and each question pair is annotated with a binary value indicating whether
the two questions are paraphrase of each other. This dataset is translated
version of QQP to Indonesian Language.
"""

_HOMEPAGE = "https://github.com/louisowen6/quora_paraphrasing_id"
Expand All @@ -38,8 +46,13 @@
_NUSANTARA_VERSION = "1.0.0"


class IndoSUM(datasets.GeneratorBasedBuilder):
"""INDOSUM is a new benchmark dataset for Indonesian text summarization. The dataset consists of news articles and manually constructed summaries."""
class IdQuoraQuestionPairs(datasets.GeneratorBasedBuilder):
"""
Quora Question Pairs (QQP) dataset consists of over 400,000 question pairs,
and each question pair is annotated with a binary value indicating whether
the two questions are paraphrase of each other. This dataset is translated
version of QQP to Indonesian Language.
"""

SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION)
Expand Down
133 changes: 133 additions & 0 deletions nusantara/nusa_datasets/id_stance/id_stance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import json
from pathlib import Path
from typing import List

import datasets
import pandas as pd

from nusantara.utils import schemas
from nusantara.utils.configs import NusantaraConfig
from nusantara.utils.constants import Tasks

_CITATION = """\
@INPROCEEDINGS{8629144,
author={R. {Jannati} and R. {Mahendra} and C. W. {Wardhana} and M. {Adriani}},
booktitle={2018 International Conference on Asian Language Processing (IALP)},
title={Stance Classification Towards Political Figures on Blog Writing},
year={2018},
volume={},
number={},
pages={96-101},
}
"""
_DATASETNAME = "id_stance"
_DESCRIPTION = """\
Stance Classification Towards Political Figures on Blog Writing.
This dataset contains dataset from the second research, which is combined from the first research and new dataset.
The dataset consist of 337 data, about five target and every target have 1 different event.
Two label are used: 'For' and 'Againts'.
1. For - the text that is created by author is support the target in an event
2. Against - the text that is created by author is oppose the target in an event
"""
_HOMEPAGE = "https://github.com/reneje/id_stance_dataset_article-Stance-Classification-Towards-Political-Figures-on-Blog-Writing"
_LICENSE = "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License"
_URLs = {
_DATASETNAME: "https://raw.githubusercontent.com/reneje/id_stance_dataset_article-Stance-Classification-Towards-Political-Figures-on-Blog-Writing/master/dataset_stance_2_label_2018_building_by_rini.csv"
}
_SUPPORTED_TASKS = [Tasks.TEXTUAL_ENTAILMENT]
_SOURCE_VERSION = "1.0.0"
_NUSANTARA_VERSION = "1.0.0"


def parse_list(content):
    """Parse a raw CSV cell into a list of strings.

    The ``content`` column of the source CSV stores either valid JSON
    (e.g. ``["a", "b"]``) or a bracketed plain string whose inner double
    quotes are NOT JSON-escaped (e.g. ``[He said "hi"]``).

    Args:
        content: Raw cell value; expected to be a string (the CSV is read
            with ``keep_default_na=False`` so empty cells arrive as ``""``).

    Returns:
        list: The decoded JSON list, a single-element list wrapping the
        bracketed text, or ``[]`` for an empty/falsy cell.

    Raises:
        json.JSONDecodeError: If the fallback repair also fails to parse.
    """
    if not content:
        return []
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        # Not valid JSON: strip the surrounding brackets, escape the inner
        # quotes, and wrap the remainder as a one-element JSON array.
        # (Narrowed from a bare `except:` so programming errors such as a
        # non-string argument are no longer silently masked.)
        return json.loads('["' + content[1:-1].replace('"', '\\"') + '"]')


class IdStance(datasets.GeneratorBasedBuilder):
    """The ID Stance dataset is annotated with a label whether the article is in favor of the person in the context of the event"""

    # Two views of the same subset: the raw "source" columns and the unified
    # Nusantara text-pairs schema.
    BUILDER_CONFIGS = [
        NusantaraConfig(
            name="id_stance_source",
            version=datasets.Version(_SOURCE_VERSION),
            description="IdStance source schema",
            schema="source",
            subset_id="id_stance",
        ),
        NusantaraConfig(
            name="id_stance_nusantara_pairs",
            version=datasets.Version(_NUSANTARA_VERSION),
            description="IdStance Nusantara schema",
            schema="nusantara_pairs",
            subset_id="id_stance",
        ),
    ]

    DEFAULT_CONFIG_NAME = "id_stance_source"

    def _info(self) -> datasets.DatasetInfo:
        """Build the DatasetInfo (feature schema + metadata) for the active config.

        NOTE(review): if ``self.config.schema`` is neither "source" nor
        "nusantara_pairs", ``features`` is never bound and the return below
        raises NameError rather than a descriptive error.
        """
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "person": datasets.Value("string"),
                    "event": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "content": datasets.Value("string"),
                    "stance_final": datasets.Value("string"),
                }
            )
        elif self.config.schema == "nusantara_pairs":
            # "againts" (sic) is included alongside "against" — presumably the
            # raw data contains both spellings of the label; verify against the
            # CSV before removing either.
            features = schemas.pairs_features(["for", "against", "againts", "no"])

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download the single CSV and expose it as one TRAIN split."""
        data_path = Path(dl_manager.download_and_extract(_URLs[_DATASETNAME]))
        data_files = {
            "train": data_path,
        }

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": data_files["train"]},
            ),
        ]

    def _generate_examples(self, filepath: Path):
        """Yield (key, example) pairs in the schema selected by the config.

        The CSV is semicolon-separated; ``keep_default_na=False`` keeps empty
        cells as "" instead of NaN, and ``reset_index`` materialises a running
        row id as the first column.
        """
        df = pd.read_csv(filepath, sep=";", header="infer", keep_default_na=False).reset_index()
        # Rename to stable column names. The trailing "" presumably absorbs an
        # extra empty column produced by a trailing ";" on each row — TODO
        # confirm against the raw file.
        df.columns = ["id", "person", "event", "title", "content", "stance_final", ""]
        # `content` cells hold bracketed/JSON-ish lists of paragraphs; decode
        # them into real Python lists.
        df.content = df.content.apply(parse_list)

        if self.config.schema == "source":
            for row in df.itertuples():
                ex = {
                    "person": row.person,
                    "event": row.event,
                    "title": row.title,
                    # Paragraph list flattened into a single string.
                    "content": " ".join(row.content),
                    "stance_final": row.stance_final
                }
                yield row.id, ex
        elif self.config.schema == "nusantara_pairs":
            for row in df.itertuples():
                ex = {
                    "id": row.id,
                    # text_1 = stance target; text_2 = title + body paragraphs.
                    "text_1": row.person + " | " + row.event,
                    "text_2": " ".join([row.title] + row.content),
                    "label": row.stance_final
                }
                yield row.id, ex
        else:
            raise ValueError(f"Invalid config: {self.config.name}")
114 changes: 114 additions & 0 deletions nusantara/nusa_datasets/indo_puisi/indo_puisi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
import pandas as pd

from nusantara.utils import schemas
from nusantara.utils.configs import NusantaraConfig
from nusantara.utils.constants import (DEFAULT_NUSANTARA_VIEW_NAME,
DEFAULT_SOURCE_VIEW_NAME, Tasks)

# Canonical dataset identifier; also used in config names and subset ids.
_DATASETNAME = "indo_puisi"
_SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME
_UNIFIED_VIEW_NAME = DEFAULT_NUSANTARA_VIEW_NAME

# No published citation is available for this scraped collection.
_CITATION = """
"""

_DESCRIPTION = """\
Puisi is an Indonesian poetic form. The dataset was collected by scraping various websites. It contains 7223 Indonesian puisi along with the title and author.
"""

_HOMEPAGE = "https://github.com/ilhamfp/puisi-pantun-generator"

_LICENSE = "Creative Commons Attribution Share-Alike 4.0 International"

# Raw poetry text is exposed for language-model pretraining.
_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING]

_SOURCE_VERSION = "1.0.0"

_NUSANTARA_VERSION = "1.0.0"

# Single CSV hosted on GitHub; the whole dataset is one train split.
_URLS = {
    "train": "https://raw.githubusercontent.com/ilhamfp/puisi-pantun-generator/main/data/puisi.csv",
}


class IndoPuisi(datasets.GeneratorBasedBuilder):
    """IndoPuisi contains 7223 Indonesian puisi along with the title and author."""

    # Two views of the same subset: raw "source" columns and the unified
    # Nusantara self-supervised-pretraining (text-only) schema.
    BUILDER_CONFIGS = (
        NusantaraConfig(
            name="indo_puisi_source",
            version=_SOURCE_VERSION,
            description="Indo puisi source schema",
            schema="source",
            subset_id="indo_puisi",
        ),
        NusantaraConfig(
            name="indo_puisi_nusantara_ssp",
            version=_NUSANTARA_VERSION,
            description="Indo puisi Nusantara schema",
            schema="nusantara_ssp",
            subset_id="indo_puisi",
        ),
    )

    DEFAULT_CONFIG_NAME = "indo_puisi_source"

    def _info(self) -> datasets.DatasetInfo:
        """Build the DatasetInfo (feature schema + metadata) for the active config."""
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "puisi": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "author": datasets.Value("string"),
                    "puisi_with_header": datasets.Value("string"),
                }
            )
        elif self.config.schema == "nusantara_ssp":
            features = schemas.self_supervised_pretraining.features
        else:
            raise ValueError(f"Invalid config schema: {self.config.schema}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators: the single CSV becomes one TRAIN split."""
        train_csv_path = Path(dl_manager.download(_URLS["train"]))

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": train_csv_path},
            ),
        ]

    def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
        """Yield (key, example) pairs in the schema selected by the config.

        ``reset_index()`` materialises the running row number as an ``index``
        column, which serves as the example id/key.

        Fix: branch on ``self.config.schema`` (as ``_info`` does) instead of
        ``self.config.name`` — previously a config with a valid schema but an
        unexpected name passed the guard and then silently yielded nothing.
        """
        df = pd.read_csv(filepath).reset_index()

        if self.config.schema == "source":
            for row in df.itertuples():
                ex = {
                    "id": str(row.index),
                    # rstrip drops trailing whitespace/newlines left by scraping;
                    # str() guards against NaN cells becoming float.
                    "puisi": str(row.puisi).rstrip(),
                    "title": row.title,
                    "author": row.author,
                    "puisi_with_header": str(row.puisi_with_header).rstrip(),
                }
                yield row.index, ex

        elif self.config.schema == "nusantara_ssp":
            for row in df.itertuples():
                ex = {"id": str(row.index), "text": str(row.puisi).rstrip()}
                yield row.index, ex

        else:
            raise ValueError(f"Invalid config schema: {self.config.schema}")
Loading

0 comments on commit eed2b41

Please sign in to comment.