Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/IndoNLP/nusa-crowd
Browse files Browse the repository at this point in the history
  • Loading branch information
holylovenia committed Aug 19, 2022
2 parents 50f62ed + 06e13e6 commit eed2b41
Show file tree
Hide file tree
Showing 17 changed files with 1,108 additions and 19 deletions.
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# These are the current maintainers/admin of the nusantara-datasets repo

* @samuelcahyawijaya @afaji @holylovenia @gentaiscool @bryanwilie @fajri91
* @samuelcahyawijaya @afaji @holylovenia @gentaiscool @bryanwilie @fajri91 @christianwbsn @muhsatrio
31 changes: 31 additions & 0 deletions .github/workflows/update-readme.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
name: Update README
on:
push:
branches:
- master
workflow_dispatch:
schedule:
- cron: '0 0 * * 0'
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.8
architecture: x64
- name: Update README
run: |-
python update_readme.py
cat README.md
- name: Commit and push if changed
run: |-
git diff
git config --global user.email "[email protected]"
git config --global user.name "README-Bot"
git add -A
git commit -m "Updated progress bar" || exit 0
git push
8 changes: 5 additions & 3 deletions README.id.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@

![Dataset claimed](https://progress-bar.dev/81/?title=Datasets%20Claimed%20(77%20Datasets%20Claimed))

<!-- milestone starts -->
![Milestone 1](https://progress-bar.dev/100/?title=Milestone%201%20(30%20Datasets%20Completed))

![Milestone 2](https://progress-bar.dev/80/?title=Milestone%202%20(60%20Datasets%20Completed))
![Milestone 2](https://progress-bar.dev/95/?title=Milestone%202%20(60%20Datasets%20Completed))

![Milestone 3](https://progress-bar.dev/48/?title=Milestone%203%20(100%20Datasets%20Completed))
![Milestone 3](https://progress-bar.dev/57/?title=Milestone%203%20(100%20Datasets%20Completed))

![Milestone 4](https://progress-bar.dev/32/?title=Milestone%204%20(150%20Datasets%20Completed))
![Milestone 4](https://progress-bar.dev/38/?title=Milestone%204%20(150%20Datasets%20Completed))
<!-- milestone ends -->

*Read this README in [English](README.md).*

Expand Down
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@

![Dataset claimed](https://progress-bar.dev/80/?title=Datasets%20Claimed%20(77%20Datasets%20Claimed))

<!-- milestone starts -->
![Milestone 1](https://progress-bar.dev/100/?title=Milestone%201%20(30%20Datasets%20Completed))

![Milestone 2](https://progress-bar.dev/80/?title=Milestone%202%20(60%20Datasets%20Completed))
![Milestone 2](https://progress-bar.dev/95/?title=Milestone%202%20(60%20Datasets%20Completed))

![Milestone 3](https://progress-bar.dev/48/?title=Milestone%203%20(100%20Datasets%20Completed))
![Milestone 3](https://progress-bar.dev/57/?title=Milestone%203%20(100%20Datasets%20Completed))

![Milestone 4](https://progress-bar.dev/32/?title=Milestone%204%20(150%20Datasets%20Completed))
![Milestone 4](https://progress-bar.dev/38/?title=Milestone%204%20(150%20Datasets%20Completed))
<!-- milestone ends -->

*Baca README ini dalam [Bahasa Indonesia](README.id.md).*

Expand Down Expand Up @@ -60,7 +62,7 @@ You can upload your dataset publicly first, eg. on Github.

#### Can I create a PR if I have an idea?

If you have an idea to improve or change the code of the nusa-crowd repository, please create an `issue` and ask for `feedback` before starting any PRs.
If you have an idea to improve or change the code of the nusa-crowd repository, please create an `issue` and ask for `feedback` before starting any PRs.

#### I am confused, can you help me?

Expand Down
23 changes: 18 additions & 5 deletions nusantara/nusa_datasets/id_qqp/id_qqp.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,22 @@
import json

_CITATION = """\
https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs
@misc{quoraFirstQuora,
author = {},
title = {{F}irst {Q}uora {D}ataset {R}elease: {Q}uestion {P}airs --- quoradata.quora.com},
howpublished = {https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs},
year = 2017,
note = {Online},
}
"""

_DATASETNAME = "id_qqp"

_DESCRIPTION = """\
INDOSUM is a new benchmark dataset for Indonesian text summarization.
The dataset consists of news articles and manually constructed summaries.
Quora Question Pairs (QQP) dataset consists of over 400,000 question pairs,
and each question pair is annotated with a binary value indicating whether
the two questions are paraphrase of each other. This dataset is translated
version of QQP to Indonesian Language.
"""

_HOMEPAGE = "https://github.com/louisowen6/quora_paraphrasing_id"
Expand All @@ -38,8 +46,13 @@
_NUSANTARA_VERSION = "1.0.0"


class IndoSUM(datasets.GeneratorBasedBuilder):
"""INDOSUM is a new benchmark dataset for Indonesian text summarization. The dataset consists of news articles and manually constructed summaries."""
class IdQuoraQuestionPairs(datasets.GeneratorBasedBuilder):
"""
Quora Question Pairs (QQP) dataset consists of over 400,000 question pairs,
and each question pair is annotated with a binary value indicating whether
the two questions are paraphrase of each other. This dataset is translated
version of QQP to Indonesian Language.
"""

SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION)
Expand Down
133 changes: 133 additions & 0 deletions nusantara/nusa_datasets/id_stance/id_stance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import json
from pathlib import Path
from typing import List

import datasets
import pandas as pd

from nusantara.utils import schemas
from nusantara.utils.configs import NusantaraConfig
from nusantara.utils.constants import Tasks

_CITATION = """\
@INPROCEEDINGS{8629144,
author={R. {Jannati} and R. {Mahendra} and C. W. {Wardhana} and M. {Adriani}},
booktitle={2018 International Conference on Asian Language Processing (IALP)},
title={Stance Classification Towards Political Figures on Blog Writing},
year={2018},
volume={},
number={},
pages={96-101},
}
"""
_DATASETNAME = "id_stance"
_DESCRIPTION = """\
Stance Classification Towards Political Figures on Blog Writing.
This dataset contains dataset from the second research, which is combined from the first research and new dataset.
The dataset consist of 337 data, about five target and every target have 1 different event.
Two label are used: 'For' and 'Againts'.
1. For - the text that is created by author is support the target in an event
2. Against - the text that is created by author is oppose the target in an event
"""
_HOMEPAGE = "https://github.com/reneje/id_stance_dataset_article-Stance-Classification-Towards-Political-Figures-on-Blog-Writing"
_LICENSE = "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License"
_URLs = {
_DATASETNAME: "https://raw.githubusercontent.com/reneje/id_stance_dataset_article-Stance-Classification-Towards-Political-Figures-on-Blog-Writing/master/dataset_stance_2_label_2018_building_by_rini.csv"
}
_SUPPORTED_TASKS = [Tasks.TEXTUAL_ENTAILMENT]
_SOURCE_VERSION = "1.0.0"
_NUSANTARA_VERSION = "1.0.0"


def parse_list(content):
    """Parse a raw CSV cell into a list of strings.

    The ``content`` column of the source CSV stores either valid JSON
    (e.g. ``["a", "b"]``) or a bracketed plain string whose inner double
    quotes are NOT JSON-escaped (e.g. ``[He said "hi"]``).

    Args:
        content: Raw cell value; expected to be a string (the CSV is read
            with ``keep_default_na=False`` so empty cells arrive as ``""``).

    Returns:
        list: The decoded JSON list, a single-element list wrapping the
        bracketed text, or ``[]`` for an empty/falsy cell.

    Raises:
        json.JSONDecodeError: If the fallback repair also fails to parse.
    """
    if not content:
        return []
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        # Not valid JSON: strip the surrounding brackets, escape the inner
        # quotes, and wrap the remainder as a one-element JSON array.
        # (Narrowed from a bare `except:` so programming errors such as a
        # non-string argument are no longer silently masked.)
        return json.loads('["' + content[1:-1].replace('"', '\\"') + '"]')


class IdStance(datasets.GeneratorBasedBuilder):
    """The ID Stance dataset is annotated with a label whether the article is in favor of the person in the context of the event"""

    # Two views of the same subset: the raw "source" columns and the unified
    # Nusantara text-pairs schema.
    BUILDER_CONFIGS = [
        NusantaraConfig(
            name="id_stance_source",
            version=datasets.Version(_SOURCE_VERSION),
            description="IdStance source schema",
            schema="source",
            subset_id="id_stance",
        ),
        NusantaraConfig(
            name="id_stance_nusantara_pairs",
            version=datasets.Version(_NUSANTARA_VERSION),
            description="IdStance Nusantara schema",
            schema="nusantara_pairs",
            subset_id="id_stance",
        ),
    ]

    DEFAULT_CONFIG_NAME = "id_stance_source"

    def _info(self) -> datasets.DatasetInfo:
        """Build the DatasetInfo (feature schema + metadata) for the active config.

        NOTE(review): if ``self.config.schema`` is neither "source" nor
        "nusantara_pairs", ``features`` is never bound and the return below
        raises NameError rather than a descriptive error.
        """
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "person": datasets.Value("string"),
                    "event": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "content": datasets.Value("string"),
                    "stance_final": datasets.Value("string"),
                }
            )
        elif self.config.schema == "nusantara_pairs":
            # "againts" (sic) is included alongside "against" — presumably the
            # raw data contains both spellings of the label; verify against the
            # CSV before removing either.
            features = schemas.pairs_features(["for", "against", "againts", "no"])

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download the single CSV and expose it as one TRAIN split."""
        data_path = Path(dl_manager.download_and_extract(_URLs[_DATASETNAME]))
        data_files = {
            "train": data_path,
        }

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": data_files["train"]},
            ),
        ]

    def _generate_examples(self, filepath: Path):
        """Yield (key, example) pairs in the schema selected by the config.

        The CSV is semicolon-separated; ``keep_default_na=False`` keeps empty
        cells as "" instead of NaN, and ``reset_index`` materialises a running
        row id as the first column.
        """
        df = pd.read_csv(filepath, sep=";", header="infer", keep_default_na=False).reset_index()
        # Rename to stable column names. The trailing "" presumably absorbs an
        # extra empty column produced by a trailing ";" on each row — TODO
        # confirm against the raw file.
        df.columns = ["id", "person", "event", "title", "content", "stance_final", ""]
        # `content` cells hold bracketed/JSON-ish lists of paragraphs; decode
        # them into real Python lists.
        df.content = df.content.apply(parse_list)

        if self.config.schema == "source":
            for row in df.itertuples():
                ex = {
                    "person": row.person,
                    "event": row.event,
                    "title": row.title,
                    # Paragraph list flattened into a single string.
                    "content": " ".join(row.content),
                    "stance_final": row.stance_final
                }
                yield row.id, ex
        elif self.config.schema == "nusantara_pairs":
            for row in df.itertuples():
                ex = {
                    "id": row.id,
                    # text_1 = stance target; text_2 = title + body paragraphs.
                    "text_1": row.person + " | " + row.event,
                    "text_2": " ".join([row.title] + row.content),
                    "label": row.stance_final
                }
                yield row.id, ex
        else:
            raise ValueError(f"Invalid config: {self.config.name}")
114 changes: 114 additions & 0 deletions nusantara/nusa_datasets/indo_puisi/indo_puisi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
import pandas as pd

from nusantara.utils import schemas
from nusantara.utils.configs import NusantaraConfig
from nusantara.utils.constants import (DEFAULT_NUSANTARA_VIEW_NAME,
DEFAULT_SOURCE_VIEW_NAME, Tasks)

# Canonical dataset identifier; also used in config names and subset ids.
_DATASETNAME = "indo_puisi"
_SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME
_UNIFIED_VIEW_NAME = DEFAULT_NUSANTARA_VIEW_NAME

# No published citation is available for this scraped collection.
_CITATION = """
"""

_DESCRIPTION = """\
Puisi is an Indonesian poetic form. The dataset was collected by scraping various websites. It contains 7223 Indonesian puisi along with the title and author.
"""

_HOMEPAGE = "https://github.com/ilhamfp/puisi-pantun-generator"

_LICENSE = "Creative Commons Attribution Share-Alike 4.0 International"

# Raw poetry text is exposed for language-model pretraining.
_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING]

_SOURCE_VERSION = "1.0.0"

_NUSANTARA_VERSION = "1.0.0"

# Single CSV hosted on GitHub; the whole dataset is one train split.
_URLS = {
    "train": "https://raw.githubusercontent.com/ilhamfp/puisi-pantun-generator/main/data/puisi.csv",
}


class IndoPuisi(datasets.GeneratorBasedBuilder):
    """IndoPuisi contains 7223 Indonesian puisi along with the title and author."""

    # Two views of the same subset: raw "source" columns and the unified
    # Nusantara self-supervised-pretraining (text-only) schema.
    BUILDER_CONFIGS = (
        NusantaraConfig(
            name="indo_puisi_source",
            version=_SOURCE_VERSION,
            description="Indo puisi source schema",
            schema="source",
            subset_id="indo_puisi",
        ),
        NusantaraConfig(
            name="indo_puisi_nusantara_ssp",
            version=_NUSANTARA_VERSION,
            description="Indo puisi Nusantara schema",
            schema="nusantara_ssp",
            subset_id="indo_puisi",
        ),
    )

    DEFAULT_CONFIG_NAME = "indo_puisi_source"

    def _info(self) -> datasets.DatasetInfo:
        """Build the DatasetInfo (feature schema + metadata) for the active config."""
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "puisi": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "author": datasets.Value("string"),
                    "puisi_with_header": datasets.Value("string"),
                }
            )
        elif self.config.schema == "nusantara_ssp":
            features = schemas.self_supervised_pretraining.features
        else:
            raise ValueError(f"Invalid config schema: {self.config.schema}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators: the single CSV becomes one TRAIN split."""
        train_csv_path = Path(dl_manager.download(_URLS["train"]))

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": train_csv_path},
            ),
        ]

    def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
        """Yield (key, example) pairs in the schema selected by the config.

        ``reset_index()`` materialises the running row number as an ``index``
        column, which serves as the example id/key.

        Fix: branch on ``self.config.schema`` (as ``_info`` does) instead of
        ``self.config.name`` — previously a config with a valid schema but an
        unexpected name passed the guard and then silently yielded nothing.
        """
        df = pd.read_csv(filepath).reset_index()

        if self.config.schema == "source":
            for row in df.itertuples():
                ex = {
                    "id": str(row.index),
                    # rstrip drops trailing whitespace/newlines left by scraping;
                    # str() guards against NaN cells becoming float.
                    "puisi": str(row.puisi).rstrip(),
                    "title": row.title,
                    "author": row.author,
                    "puisi_with_header": str(row.puisi_with_header).rstrip(),
                }
                yield row.index, ex

        elif self.config.schema == "nusantara_ssp":
            for row in df.itertuples():
                ex = {"id": str(row.index), "text": str(row.puisi).rstrip()}
                yield row.index, ex

        else:
            raise ValueError(f"Invalid config schema: {self.config.schema}")
Loading

0 comments on commit eed2b41

Please sign in to comment.