Merge branch 'master' of https://github.com/IndoNLP/nusa-crowd
Showing 17 changed files with 1,108 additions and 19 deletions.
CODEOWNERS
@@ -1,3 +1,3 @@
 # These are the current maintainers/admin of the nusantara-datasets repo

-* @samuelcahyawijaya @afaji @holylovenia @gentaiscool @bryanwilie @fajri91
+* @samuelcahyawijaya @afaji @holylovenia @gentaiscool @bryanwilie @fajri91 @christianwbsn @muhsatrio
GitHub Actions workflow (new file)
@@ -0,0 +1,31 @@
name: Update README
on:
  push:
    branches:
    - master
  workflow_dispatch:
  schedule:
  - cron: '0 0 * * 0'
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
    - name: Check out repo
      uses: actions/checkout@v2
    - name: Set up Python
      uses: actions/setup-python@v2
      with:
        python-version: 3.8
        architecture: x64
    - name: Update README
      run: |-
        python update_readme.py
        cat README.md
    - name: Commit and push if changed
      run: |-
        git diff
        git config --global user.email "[email protected]"
        git config --global user.name "README-Bot"
        git add -A
        git commit -m "Updated progress bar" || exit 0
        git push
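The workflow relies on an update_readme.py script at the repository root to rewrite README.md (the commit message it pushes refers to a progress bar). That script is not part of this commit; purely as an illustration of the general pattern, a hypothetical minimal sketch with made-up marker names and a placeholder progress value:

import re
from pathlib import Path

# Hypothetical markers; the real README and script may use different conventions.
START, END = "<!--progress-start-->", "<!--progress-end-->"

readme = Path("README.md")
content = readme.read_text(encoding="utf-8")

# Placeholder value; the real script would compute progress from the repository contents.
progress = "Progress: 42/100 datasets implemented"
updated = re.sub(f"{START}.*?{END}", f"{START}\n{progress}\n{END}", content, flags=re.DOTALL)

readme.write_text(updated, encoding="utf-8")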
id_stance dataset loader (new file)
@@ -0,0 +1,133 @@
import json
from pathlib import Path
from typing import List

import datasets
import pandas as pd

from nusantara.utils import schemas
from nusantara.utils.configs import NusantaraConfig
from nusantara.utils.constants import Tasks

_CITATION = """\
@INPROCEEDINGS{8629144,
author={R. {Jannati} and R. {Mahendra} and C. W. {Wardhana} and M. {Adriani}},
booktitle={2018 International Conference on Asian Language Processing (IALP)},
title={Stance Classification Towards Political Figures on Blog Writing},
year={2018},
volume={},
number={},
pages={96-101},
}
"""
_DATASETNAME = "id_stance"
_DESCRIPTION = """\
Stance Classification Towards Political Figures on Blog Writing.
This dataset comes from the second study, which combines the dataset from the first study with newly collected data.
It consists of 337 examples covering five targets, each target associated with one event.
Two labels are used: 'For' and 'Against'.
1. For - the text written by the author supports the target in the context of the event
2. Against - the text written by the author opposes the target in the context of the event
"""
_HOMEPAGE = "https://github.com/reneje/id_stance_dataset_article-Stance-Classification-Towards-Political-Figures-on-Blog-Writing"
_LICENSE = "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License"
_URLs = {
    _DATASETNAME: "https://raw.githubusercontent.com/reneje/id_stance_dataset_article-Stance-Classification-Towards-Political-Figures-on-Blog-Writing/master/dataset_stance_2_label_2018_building_by_rini.csv"
}
_SUPPORTED_TASKS = [Tasks.TEXTUAL_ENTAILMENT]
_SOURCE_VERSION = "1.0.0"
_NUSANTARA_VERSION = "1.0.0"


def parse_list(content):
    """Parse the `content` column, which stores a JSON-encoded list of paragraphs."""
    if not content:
        return []
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        # Some rows are not valid JSON; strip the surrounding brackets, escape the
        # inner quotes, and wrap the remainder as a single-element JSON list.
        return json.loads('["' + content[1:-1].replace('"', '\\"') + '"]')

class IdStance(datasets.GeneratorBasedBuilder):
    """The ID Stance dataset is annotated with a label indicating whether the article is in favor of the person in the context of the event"""

    BUILDER_CONFIGS = [
        NusantaraConfig(
            name="id_stance_source",
            version=datasets.Version(_SOURCE_VERSION),
            description="IdStance source schema",
            schema="source",
            subset_id="id_stance",
        ),
        NusantaraConfig(
            name="id_stance_nusantara_pairs",
            version=datasets.Version(_NUSANTARA_VERSION),
            description="IdStance Nusantara schema",
            schema="nusantara_pairs",
            subset_id="id_stance",
        ),
    ]

    DEFAULT_CONFIG_NAME = "id_stance_source"

    def _info(self):
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "person": datasets.Value("string"),
                    "event": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "content": datasets.Value("string"),
                    "stance_final": datasets.Value("string"),
                }
            )
        elif self.config.schema == "nusantara_pairs":
            # Label values as they appear in the raw data, including the misspelled "againts".
            features = schemas.pairs_features(["for", "against", "againts", "no"])

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )
|
||
def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: | ||
data_path = Path(dl_manager.download_and_extract(_URLs[_DATASETNAME])) | ||
data_files = { | ||
"train": data_path, | ||
} | ||
|
||
return [ | ||
datasets.SplitGenerator( | ||
name=datasets.Split.TRAIN, | ||
gen_kwargs={"filepath": data_files["train"]}, | ||
), | ||
] | ||
|
||
    def _generate_examples(self, filepath: Path):
        # The raw CSV is ';'-separated; keep_default_na=False keeps empty cells as "".
        df = pd.read_csv(filepath, sep=";", header="infer", keep_default_na=False).reset_index()
        # reset_index() adds the positional index as the first column, renamed to "id" here;
        # the trailing empty string names an extra, unused column in the raw file.
        df.columns = ["id", "person", "event", "title", "content", "stance_final", ""]
        df.content = df.content.apply(parse_list)

        if self.config.schema == "source":
            for row in df.itertuples():
                ex = {
                    "person": row.person,
                    "event": row.event,
                    "title": row.title,
                    "content": " ".join(row.content),
                    "stance_final": row.stance_final,
                }
                yield row.id, ex
        elif self.config.schema == "nusantara_pairs":
            for row in df.itertuples():
                ex = {
                    "id": row.id,
                    "text_1": row.person + " | " + row.event,
                    "text_2": " ".join([row.title] + row.content),
                    "label": row.stance_final,
                }
                yield row.id, ex
        else:
            raise ValueError(f"Invalid config: {self.config.name}")
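For context, a loader script like the one above is normally consumed through datasets.load_dataset pointed at the file; a minimal usage sketch, assuming the script is saved locally as id_stance.py (that file name is an assumption, not shown in this commit):

import datasets

# Hypothetical local path to the loader script defined above.
pairs = datasets.load_dataset("id_stance.py", name="id_stance_nusantara_pairs", split="train")

# The nusantara_pairs schema yields id / text_1 / text_2 / label records,
# with text_1 = "<person> | <event>" and text_2 = title plus content.
print(pairs[0])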
indo_puisi dataset loader (new file)
@@ -0,0 +1,114 @@
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
import pandas as pd

from nusantara.utils import schemas
from nusantara.utils.configs import NusantaraConfig
from nusantara.utils.constants import (DEFAULT_NUSANTARA_VIEW_NAME,
                                        DEFAULT_SOURCE_VIEW_NAME, Tasks)

_DATASETNAME = "indo_puisi"
_SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME
_UNIFIED_VIEW_NAME = DEFAULT_NUSANTARA_VIEW_NAME

_CITATION = """
"""

_DESCRIPTION = """\
Puisi is an Indonesian poetic form. The dataset was collected by scraping various websites. It contains 7223 Indonesian puisi along with the title and author.
"""

_HOMEPAGE = "https://github.com/ilhamfp/puisi-pantun-generator"

_LICENSE = "Creative Commons Attribution Share-Alike 4.0 International"

_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING]

_SOURCE_VERSION = "1.0.0"

_NUSANTARA_VERSION = "1.0.0"

_URLS = {
    "train": "https://raw.githubusercontent.com/ilhamfp/puisi-pantun-generator/main/data/puisi.csv",
}


class IndoPuisi(datasets.GeneratorBasedBuilder):
    """IndoPuisi contains 7223 Indonesian puisi along with the title and author."""

    BUILDER_CONFIGS = (
        NusantaraConfig(
            name="indo_puisi_source",
            version=_SOURCE_VERSION,
            description="Indo puisi source schema",
            schema="source",
            subset_id="indo_puisi",
        ),
        NusantaraConfig(
            name="indo_puisi_nusantara_ssp",
            version=_NUSANTARA_VERSION,
            description="Indo puisi Nusantara schema",
            schema="nusantara_ssp",
            subset_id="indo_puisi",
        ),
    )

    DEFAULT_CONFIG_NAME = "indo_puisi_source"

    def _info(self) -> datasets.DatasetInfo:
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "puisi": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "author": datasets.Value("string"),
                    "puisi_with_header": datasets.Value("string"),
                }
            )
        elif self.config.schema == "nusantara_ssp":
            features = schemas.self_supervised_pretraining.features
        else:
            raise ValueError(f"Invalid config schema: {self.config.schema}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators."""
        train_csv_path = Path(dl_manager.download(_URLS["train"]))

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": train_csv_path},
            ),
        ]
    def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
        if self.config.schema != "source" and self.config.schema != "nusantara_ssp":
            raise ValueError(f"Invalid config schema: {self.config.schema}")

        # reset_index() exposes the positional index as an "index" column, used below as the example id.
        df = pd.read_csv(filepath).reset_index()
        if self.config.name == "indo_puisi_source":
            for row in df.itertuples():
                ex = {
                    "id": str(row.index),
                    "puisi": str(row.puisi).rstrip(),
                    "title": row.title,
                    "author": row.author,
                    "puisi_with_header": str(row.puisi_with_header).rstrip(),
                }
                yield row.index, ex

        elif self.config.name == "indo_puisi_nusantara_ssp":
            for row in df.itertuples():
                ex = {"id": str(row.index), "text": str(row.puisi).rstrip()}
                yield row.index, ex
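Similarly, the indo_puisi loader can be exercised through either config; a short sketch using the nusantara_ssp view, again assuming the script is saved locally as indo_puisi.py (a hypothetical path):

import datasets

# Hypothetical local path to the loader script defined above.
ssp = datasets.load_dataset("indo_puisi.py", name="indo_puisi_nusantara_ssp", split="train")

# The self-supervised pretraining schema exposes plain {"id", "text"} records,
# so the corpus can be written straight to a text file.
with open("puisi_corpus.txt", "w", encoding="utf-8") as f:
    for example in ssp:
        f.write(example["text"].replace("\n", " ") + "\n")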