From c0d003d1add381089492eb0782f4c3ae44595b4a Mon Sep 17 00:00:00 2001 From: Nadya Aditama Date: Sat, 24 Sep 2022 21:52:33 +0700 Subject: [PATCH 01/12] data loader sundanese twitter emotions --- .../sunda_twitter_emotions.py | 125 ++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 nusantara/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py diff --git a/nusantara/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py b/nusantara/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py new file mode 100644 index 00000000..9dfbb741 --- /dev/null +++ b/nusantara/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py @@ -0,0 +1,125 @@ +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from nusantara.utils import schemas +from nusantara.utils.configs import NusantaraConfig +from nusantara.utils.constants import DEFAULT_NUSANTARA_VIEW_NAME, DEFAULT_SOURCE_VIEW_NAME, Tasks + +# TODO: Add BibTeX citation +_CITATION = """\ +@INPROCEEDINGS{ +9297929, +author={Putra, Oddy Virgantara and Wasmanson, Fathin Muhammad and Harmini, Triana and Utama, Shoffin Nahwa}, +booktitle={2020 International Conference on Computer Engineering, Network, and Intelligent Multimedia (CENIM)}, +title={Sundanese Twitter Dataset for Emotion Classification}, +year={2020}, +volume={}, +number={}, +pages={391-395}, +doi={10.1109/CENIM51130.2020.9297929} +} +""" + +_DATASETNAME = "sunda_twitter_emotions" + +_DESCRIPTION = """\ +This dataset is designed for Emotion Classification NLP task. +""" +_HOMEPAGE = "" + +_LICENSE = "UNKNOWN" + +_URLS = { + "datasets": "https://raw.githubusercontent.com/virgantara/sundanese-twitter-dataset/master/newdataset.csv" +} + +_SUPPORTED_TASKS = [Tasks.EMOTION_CLASSIFICATION] + + +_SOURCE_VERSION = "1.0.0" + +_NUSANTARA_VERSION = "1.0.0" + + +class NewDataset(datasets.GeneratorBasedBuilder): + """TODO: Short description of my dataset.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION) + + BUILDER_CONFIGS = [ + NusantaraConfig( + name="sunda_twitter_emotions_source", + version=SOURCE_VERSION, + description="Sundanese Twitter Dataset for Emotion source schema", + schema="source", + subset_id="sunda_twitter_emotions", + ), + NusantaraConfig( + name="sunda_twitter_emotions_nusantara_text", + version=NUSANTARA_VERSION, + description="Sundanese Twitter Dataset for Emotion Nusantara schema", + schema="nusantara_text", + subset_id="sunda_twitter_emotions", + ), + ] + + DEFAULT_CONFIG_NAME = "sunda_twitter_emotions_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features({ + "index": datasets.Value("string"), + "data": datasets.Value("string"), + "label": datasets.Value("string")}) + + # For example nusantara_kb, nusantara_t2t + elif self.config.schema == "nusantara_text": + features = schemas.text_features(["anger", "joy", "fear", "sadness"]) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + urls = _URLS + data_dir = Path(dl_manager.download_and_extract(urls['datasets'])) + data_files = {"train":data_dir} + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_files['train'], + "split": "train", + }, + ) + ] 
+ + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + + df = pd.read_csv(filepath, sep=",", header="infer").reset_index() + df.columns = ["index","label", "data"] + + if self.config.schema == "source": + for row in df.itertuples(): + ex = {"index": str(row.index), "data": row.data, "label": row.label} + yield row.index, ex + elif self.config.schema == "nusantara_text": + for row in df.itertuples(): + ex = {"id": str(row.index), "text": row.data, "label": row.label} + yield row.index, ex + else: + raise ValueError(f"Invalid config: {self.config.name}") + +if __name__ == "__main__": + datasets.load_dataset(__file__) From 5d2311436d6048ff6aedfb43a14552cadf77fe57 Mon Sep 17 00:00:00 2001 From: Nadya Aditama Date: Mon, 26 Sep 2022 19:02:36 +0700 Subject: [PATCH 02/12] data loader for sunda twitter emotions dataset --- .../sunda_twitter_emotions.py | 125 ++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 nusacrowd/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py diff --git a/nusacrowd/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py b/nusacrowd/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py new file mode 100644 index 00000000..4071f6d2 --- /dev/null +++ b/nusacrowd/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py @@ -0,0 +1,125 @@ +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from nusacrowd.utils import schemas +from nusacrowd.utils.configs import NusantaraConfig +from nusacrowd.utils.constants import DEFAULT_NUSANTARA_VIEW_NAME, DEFAULT_SOURCE_VIEW_NAME, Tasks + +# TODO: Add BibTeX citation +_CITATION = """\ +@INPROCEEDINGS{ +9297929, +author={Putra, Oddy Virgantara and Wasmanson, Fathin Muhammad and Harmini, Triana and Utama, Shoffin Nahwa}, +booktitle={2020 International Conference on Computer Engineering, Network, and Intelligent Multimedia (CENIM)}, +title={Sundanese Twitter Dataset for Emotion Classification}, +year={2020}, +volume={}, +number={}, +pages={391-395}, +doi={10.1109/CENIM51130.2020.9297929} +} +""" + +_DATASETNAME = "sunda_twitter_emotions" + +_DESCRIPTION = """\ +This dataset is designed for Emotion Classification NLP task. 
+""" +_HOMEPAGE = "" + +_LICENSE = "UNKNOWN" + +_URLS = { + "datasets": "https://raw.githubusercontent.com/virgantara/sundanese-twitter-dataset/master/newdataset.csv" +} + +_SUPPORTED_TASKS = [Tasks.EMOTION_CLASSIFICATION] + + +_SOURCE_VERSION = "1.0.0" + +_NUSANTARA_VERSION = "1.0.0" + + +class NewDataset(datasets.GeneratorBasedBuilder): + """TODO: Short description of my dataset.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION) + + BUILDER_CONFIGS = [ + NusantaraConfig( + name="sunda_twitter_emotions_source", + version=SOURCE_VERSION, + description="Sundanese Twitter Dataset for Emotion source schema", + schema="source", + subset_id="sunda_twitter_emotions", + ), + NusantaraConfig( + name="sunda_twitter_emotions_nusantara_text", + version=NUSANTARA_VERSION, + description="Sundanese Twitter Dataset for Emotion Nusantara schema", + schema="nusantara_text", + subset_id="sunda_twitter_emotions", + ), + ] + + DEFAULT_CONFIG_NAME = "sunda_twitter_emotions_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features({ + "index": datasets.Value("string"), + "data": datasets.Value("string"), + "label": datasets.Value("string")}) + + # For example nusantara_kb, nusantara_t2t + elif self.config.schema == "nusantara_text": + features = schemas.text_features(["anger", "joy", "fear", "sadness"]) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + urls = _URLS + data_dir = Path(dl_manager.download_and_extract(urls['datasets'])) + data_files = {"train":data_dir} + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_files['train'], + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + + df = pd.read_csv(filepath, sep=",", header="infer").reset_index() + df.columns = ["index","label", "data"] + + if self.config.schema == "source": + for row in df.itertuples(): + ex = {"index": str(row.index), "data": row.data, "label": row.label} + yield row.index, ex + elif self.config.schema == "nusantara_text": + for row in df.itertuples(): + ex = {"id": str(row.index), "text": row.data, "label": row.label} + yield row.index, ex + else: + raise ValueError(f"Invalid config: {self.config.name}") + +if __name__ == "__main__": + datasets.load_dataset(__file__) From 1cdc881efc34d8fb222b34a7ea1bdd076ba56686 Mon Sep 17 00:00:00 2001 From: Nadya Aditama Date: Mon, 26 Sep 2022 19:06:58 +0700 Subject: [PATCH 03/12] Delete sunda_twitter_emotions.py --- .../sunda_twitter_emotions.py | 125 ------------------ 1 file changed, 125 deletions(-) delete mode 100644 nusantara/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py diff --git a/nusantara/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py b/nusantara/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py deleted file mode 100644 index 9dfbb741..00000000 --- a/nusantara/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py +++ /dev/null @@ -1,125 +0,0 @@ -import os -from pathlib import Path -from typing import Dict, List, Tuple - -import datasets -import pandas as pd - -from nusantara.utils import schemas -from nusantara.utils.configs import NusantaraConfig -from nusantara.utils.constants import 
DEFAULT_NUSANTARA_VIEW_NAME, DEFAULT_SOURCE_VIEW_NAME, Tasks - -# TODO: Add BibTeX citation -_CITATION = """\ -@INPROCEEDINGS{ -9297929, -author={Putra, Oddy Virgantara and Wasmanson, Fathin Muhammad and Harmini, Triana and Utama, Shoffin Nahwa}, -booktitle={2020 International Conference on Computer Engineering, Network, and Intelligent Multimedia (CENIM)}, -title={Sundanese Twitter Dataset for Emotion Classification}, -year={2020}, -volume={}, -number={}, -pages={391-395}, -doi={10.1109/CENIM51130.2020.9297929} -} -""" - -_DATASETNAME = "sunda_twitter_emotions" - -_DESCRIPTION = """\ -This dataset is designed for Emotion Classification NLP task. -""" -_HOMEPAGE = "" - -_LICENSE = "UNKNOWN" - -_URLS = { - "datasets": "https://raw.githubusercontent.com/virgantara/sundanese-twitter-dataset/master/newdataset.csv" -} - -_SUPPORTED_TASKS = [Tasks.EMOTION_CLASSIFICATION] - - -_SOURCE_VERSION = "1.0.0" - -_NUSANTARA_VERSION = "1.0.0" - - -class NewDataset(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION) - - BUILDER_CONFIGS = [ - NusantaraConfig( - name="sunda_twitter_emotions_source", - version=SOURCE_VERSION, - description="Sundanese Twitter Dataset for Emotion source schema", - schema="source", - subset_id="sunda_twitter_emotions", - ), - NusantaraConfig( - name="sunda_twitter_emotions_nusantara_text", - version=NUSANTARA_VERSION, - description="Sundanese Twitter Dataset for Emotion Nusantara schema", - schema="nusantara_text", - subset_id="sunda_twitter_emotions", - ), - ] - - DEFAULT_CONFIG_NAME = "sunda_twitter_emotions_source" - - def _info(self) -> datasets.DatasetInfo: - - if self.config.schema == "source": - features = datasets.Features({ - "index": datasets.Value("string"), - "data": datasets.Value("string"), - "label": datasets.Value("string")}) - - # For example nusantara_kb, nusantara_t2t - elif self.config.schema == "nusantara_text": - features = schemas.text_features(["anger", "joy", "fear", "sadness"]) - - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=features, - homepage=_HOMEPAGE, - license=_LICENSE, - citation=_CITATION, - ) - - def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: - urls = _URLS - data_dir = Path(dl_manager.download_and_extract(urls['datasets'])) - data_files = {"train":data_dir} - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "filepath": data_files['train'], - "split": "train", - }, - ) - ] - - def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: - - df = pd.read_csv(filepath, sep=",", header="infer").reset_index() - df.columns = ["index","label", "data"] - - if self.config.schema == "source": - for row in df.itertuples(): - ex = {"index": str(row.index), "data": row.data, "label": row.label} - yield row.index, ex - elif self.config.schema == "nusantara_text": - for row in df.itertuples(): - ex = {"id": str(row.index), "text": row.data, "label": row.label} - yield row.index, ex - else: - raise ValueError(f"Invalid config: {self.config.name}") - -if __name__ == "__main__": - datasets.load_dataset(__file__) From b3e311981f098b9ca07054dd8d15643b9725f913 Mon Sep 17 00:00:00 2001 From: Nadya Aditama Date: Mon, 26 Sep 2022 20:01:43 +0700 Subject: [PATCH 04/12] Delete sunda_twitter_emotions.py --- .../sunda_twitter_emotions.py | 125 ------------------ 1 file changed, 125 deletions(-) delete mode 
100644 nusacrowd/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py diff --git a/nusacrowd/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py b/nusacrowd/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py deleted file mode 100644 index 4071f6d2..00000000 --- a/nusacrowd/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py +++ /dev/null @@ -1,125 +0,0 @@ -import os -from pathlib import Path -from typing import Dict, List, Tuple - -import datasets -import pandas as pd - -from nusacrowd.utils import schemas -from nusacrowd.utils.configs import NusantaraConfig -from nusacrowd.utils.constants import DEFAULT_NUSANTARA_VIEW_NAME, DEFAULT_SOURCE_VIEW_NAME, Tasks - -# TODO: Add BibTeX citation -_CITATION = """\ -@INPROCEEDINGS{ -9297929, -author={Putra, Oddy Virgantara and Wasmanson, Fathin Muhammad and Harmini, Triana and Utama, Shoffin Nahwa}, -booktitle={2020 International Conference on Computer Engineering, Network, and Intelligent Multimedia (CENIM)}, -title={Sundanese Twitter Dataset for Emotion Classification}, -year={2020}, -volume={}, -number={}, -pages={391-395}, -doi={10.1109/CENIM51130.2020.9297929} -} -""" - -_DATASETNAME = "sunda_twitter_emotions" - -_DESCRIPTION = """\ -This dataset is designed for Emotion Classification NLP task. -""" -_HOMEPAGE = "" - -_LICENSE = "UNKNOWN" - -_URLS = { - "datasets": "https://raw.githubusercontent.com/virgantara/sundanese-twitter-dataset/master/newdataset.csv" -} - -_SUPPORTED_TASKS = [Tasks.EMOTION_CLASSIFICATION] - - -_SOURCE_VERSION = "1.0.0" - -_NUSANTARA_VERSION = "1.0.0" - - -class NewDataset(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION) - - BUILDER_CONFIGS = [ - NusantaraConfig( - name="sunda_twitter_emotions_source", - version=SOURCE_VERSION, - description="Sundanese Twitter Dataset for Emotion source schema", - schema="source", - subset_id="sunda_twitter_emotions", - ), - NusantaraConfig( - name="sunda_twitter_emotions_nusantara_text", - version=NUSANTARA_VERSION, - description="Sundanese Twitter Dataset for Emotion Nusantara schema", - schema="nusantara_text", - subset_id="sunda_twitter_emotions", - ), - ] - - DEFAULT_CONFIG_NAME = "sunda_twitter_emotions_source" - - def _info(self) -> datasets.DatasetInfo: - - if self.config.schema == "source": - features = datasets.Features({ - "index": datasets.Value("string"), - "data": datasets.Value("string"), - "label": datasets.Value("string")}) - - # For example nusantara_kb, nusantara_t2t - elif self.config.schema == "nusantara_text": - features = schemas.text_features(["anger", "joy", "fear", "sadness"]) - - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=features, - homepage=_HOMEPAGE, - license=_LICENSE, - citation=_CITATION, - ) - - def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: - urls = _URLS - data_dir = Path(dl_manager.download_and_extract(urls['datasets'])) - data_files = {"train":data_dir} - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "filepath": data_files['train'], - "split": "train", - }, - ) - ] - - def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: - - df = pd.read_csv(filepath, sep=",", header="infer").reset_index() - df.columns = ["index","label", "data"] - - if self.config.schema == "source": - for row in df.itertuples(): - ex = {"index": 
str(row.index), "data": row.data, "label": row.label} - yield row.index, ex - elif self.config.schema == "nusantara_text": - for row in df.itertuples(): - ex = {"id": str(row.index), "text": row.data, "label": row.label} - yield row.index, ex - else: - raise ValueError(f"Invalid config: {self.config.name}") - -if __name__ == "__main__": - datasets.load_dataset(__file__) From c8648a8affe607215ac2adfc182de4be03c11d9b Mon Sep 17 00:00:00 2001 From: Nadya Aditama Date: Mon, 26 Sep 2022 21:06:15 +0700 Subject: [PATCH 05/12] rename files sundanese twitter emotions --- nusacrowd/nusa_datasets/su_emot/su_emot.py | 125 +++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 nusacrowd/nusa_datasets/su_emot/su_emot.py diff --git a/nusacrowd/nusa_datasets/su_emot/su_emot.py b/nusacrowd/nusa_datasets/su_emot/su_emot.py new file mode 100644 index 00000000..dc53ed5e --- /dev/null +++ b/nusacrowd/nusa_datasets/su_emot/su_emot.py @@ -0,0 +1,125 @@ +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from nusacrowd.utils import schemas +from nusacrowd.utils.configs import NusantaraConfig +from nusacrowd.utils.constants import DEFAULT_NUSANTARA_VIEW_NAME, DEFAULT_SOURCE_VIEW_NAME, Tasks + +# TODO: Add BibTeX citation +_CITATION = """\ +@INPROCEEDINGS{ +9297929, +author={Putra, Oddy Virgantara and Wasmanson, Fathin Muhammad and Harmini, Triana and Utama, Shoffin Nahwa}, +booktitle={2020 International Conference on Computer Engineering, Network, and Intelligent Multimedia (CENIM)}, +title={Sundanese Twitter Dataset for Emotion Classification}, +year={2020}, +volume={}, +number={}, +pages={391-395}, +doi={10.1109/CENIM51130.2020.9297929} +} +""" + +_DATASETNAME = "su_emot" + +_DESCRIPTION = """\ +This dataset is designed for Emotion Classification NLP task. 
+""" +_HOMEPAGE = "" + +_LICENSE = "UNKNOWN" + +_URLS = { + "datasets": "https://raw.githubusercontent.com/virgantara/sundanese-twitter-dataset/master/newdataset.csv" +} + +_SUPPORTED_TASKS = [Tasks.EMOTION_CLASSIFICATION] + + +_SOURCE_VERSION = "1.0.0" + +_NUSANTARA_VERSION = "1.0.0" + + +class NewDataset(datasets.GeneratorBasedBuilder): + """TODO: Short description of my dataset.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION) + + BUILDER_CONFIGS = [ + NusantaraConfig( + name="su_emot_source", + version=SOURCE_VERSION, + description="Sundanese Twitter Dataset for Emotion source schema", + schema="source", + subset_id="su_emot", + ), + NusantaraConfig( + name="su_emot_nusantara_text", + version=NUSANTARA_VERSION, + description="Sundanese Twitter Dataset for Emotion Nusantara schema", + schema="nusantara_text", + subset_id="su_emot", + ), + ] + + DEFAULT_CONFIG_NAME = "su_emot_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features({ + "index": datasets.Value("string"), + "data": datasets.Value("string"), + "label": datasets.Value("string")}) + + # For example nusantara_kb, nusantara_t2t + elif self.config.schema == "nusantara_text": + features = schemas.text_features(["anger", "joy", "fear", "sadness"]) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + urls = _URLS + data_dir = Path(dl_manager.download_and_extract(urls['datasets'])) + data_files = {"train":data_dir} + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_files['train'], + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + + df = pd.read_csv(filepath, sep=",", header="infer").reset_index() + df.columns = ["index","label", "data"] + + if self.config.schema == "source": + for row in df.itertuples(): + ex = {"index": str(row.index), "data": row.data, "label": row.label} + yield row.index, ex + elif self.config.schema == "nusantara_text": + for row in df.itertuples(): + ex = {"id": str(row.index), "text": row.data, "label": row.label} + yield row.index, ex + else: + raise ValueError(f"Invalid config: {self.config.name}") + +if __name__ == "__main__": + datasets.load_dataset(__file__) From 2b7174ca5d6cda465f741e7ca52ffee3d737e16c Mon Sep 17 00:00:00 2001 From: Nadya Aditama Date: Fri, 30 Sep 2022 20:22:58 +0700 Subject: [PATCH 06/12] Updated change sundanese twitter emotions --- nusacrowd/nusa_datasets/su_emot/__init__.py | 0 nusacrowd/nusa_datasets/su_emot/su_emot.py | 22 +++++++++++++-------- 2 files changed, 14 insertions(+), 8 deletions(-) create mode 100644 nusacrowd/nusa_datasets/su_emot/__init__.py diff --git a/nusacrowd/nusa_datasets/su_emot/__init__.py b/nusacrowd/nusa_datasets/su_emot/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nusacrowd/nusa_datasets/su_emot/su_emot.py b/nusacrowd/nusa_datasets/su_emot/su_emot.py index dc53ed5e..1c0f823c 100644 --- a/nusacrowd/nusa_datasets/su_emot/su_emot.py +++ b/nusacrowd/nusa_datasets/su_emot/su_emot.py @@ -9,7 +9,12 @@ from nusacrowd.utils.configs import NusantaraConfig from nusacrowd.utils.constants import DEFAULT_NUSANTARA_VIEW_NAME, DEFAULT_SOURCE_VIEW_NAME, Tasks -# TODO: Add BibTeX citation 
+_DATASETNAME = "su_emot" +_SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME +_UNIFIED_VIEW_NAME = DEFAULT_NUSANTARA_VIEW_NAME + +_LANGUAGES = ["sun"] +_LOCAL = False _CITATION = """\ @INPROCEEDINGS{ 9297929, @@ -19,17 +24,18 @@ year={2020}, volume={}, number={}, -pages={391-395}, +pages={391--395}, doi={10.1109/CENIM51130.2020.9297929} } """ -_DATASETNAME = "su_emot" - _DESCRIPTION = """\ -This dataset is designed for Emotion Classification NLP task. +This is a dataset for emotion classification of Sundanese text. The dataset is gathered from Twitter API between January and March 2019 with 2518 tweets in total. +The tweets filtered by using some hashtags which are represented Sundanese emotion, for instance, #persib, #corona, #saredih, #nyakakak, #garoblog, #sangsara, #gumujeng, #bungah, #sararieun, #ceurik, and #hariwang. +This dataset contains four distinctive emotions: anger, joy, fear, and sadness. Each tweet is annotated using related emotion. For data +validation, the authors consulted a Sundanese language teacher for expert validation. """ -_HOMEPAGE = "" +_HOMEPAGE = "https://github.com/virgantara/sundanese-twitter-dataset" _LICENSE = "UNKNOWN" @@ -45,8 +51,8 @@ _NUSANTARA_VERSION = "1.0.0" -class NewDataset(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" +class suEmot(datasets.GeneratorBasedBuilder): + """This is a dataset for emotion classification of Sundanese text. The dataset is gathered from Twitter API between January and March 2019 with 2518 tweets in total.""" SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION) From dac6a12226699107a623e1e597de43ea2b65f495 Mon Sep 17 00:00:00 2001 From: Nadya Aditama Date: Sat, 24 Sep 2022 21:52:33 +0700 Subject: [PATCH 07/12] data loader sundanese twitter emotions --- .../sunda_twitter_emotions.py | 125 ++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 nusantara/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py diff --git a/nusantara/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py b/nusantara/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py new file mode 100644 index 00000000..9dfbb741 --- /dev/null +++ b/nusantara/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py @@ -0,0 +1,125 @@ +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from nusantara.utils import schemas +from nusantara.utils.configs import NusantaraConfig +from nusantara.utils.constants import DEFAULT_NUSANTARA_VIEW_NAME, DEFAULT_SOURCE_VIEW_NAME, Tasks + +# TODO: Add BibTeX citation +_CITATION = """\ +@INPROCEEDINGS{ +9297929, +author={Putra, Oddy Virgantara and Wasmanson, Fathin Muhammad and Harmini, Triana and Utama, Shoffin Nahwa}, +booktitle={2020 International Conference on Computer Engineering, Network, and Intelligent Multimedia (CENIM)}, +title={Sundanese Twitter Dataset for Emotion Classification}, +year={2020}, +volume={}, +number={}, +pages={391-395}, +doi={10.1109/CENIM51130.2020.9297929} +} +""" + +_DATASETNAME = "sunda_twitter_emotions" + +_DESCRIPTION = """\ +This dataset is designed for Emotion Classification NLP task. 
+""" +_HOMEPAGE = "" + +_LICENSE = "UNKNOWN" + +_URLS = { + "datasets": "https://raw.githubusercontent.com/virgantara/sundanese-twitter-dataset/master/newdataset.csv" +} + +_SUPPORTED_TASKS = [Tasks.EMOTION_CLASSIFICATION] + + +_SOURCE_VERSION = "1.0.0" + +_NUSANTARA_VERSION = "1.0.0" + + +class NewDataset(datasets.GeneratorBasedBuilder): + """TODO: Short description of my dataset.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION) + + BUILDER_CONFIGS = [ + NusantaraConfig( + name="sunda_twitter_emotions_source", + version=SOURCE_VERSION, + description="Sundanese Twitter Dataset for Emotion source schema", + schema="source", + subset_id="sunda_twitter_emotions", + ), + NusantaraConfig( + name="sunda_twitter_emotions_nusantara_text", + version=NUSANTARA_VERSION, + description="Sundanese Twitter Dataset for Emotion Nusantara schema", + schema="nusantara_text", + subset_id="sunda_twitter_emotions", + ), + ] + + DEFAULT_CONFIG_NAME = "sunda_twitter_emotions_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features({ + "index": datasets.Value("string"), + "data": datasets.Value("string"), + "label": datasets.Value("string")}) + + # For example nusantara_kb, nusantara_t2t + elif self.config.schema == "nusantara_text": + features = schemas.text_features(["anger", "joy", "fear", "sadness"]) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + urls = _URLS + data_dir = Path(dl_manager.download_and_extract(urls['datasets'])) + data_files = {"train":data_dir} + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_files['train'], + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + + df = pd.read_csv(filepath, sep=",", header="infer").reset_index() + df.columns = ["index","label", "data"] + + if self.config.schema == "source": + for row in df.itertuples(): + ex = {"index": str(row.index), "data": row.data, "label": row.label} + yield row.index, ex + elif self.config.schema == "nusantara_text": + for row in df.itertuples(): + ex = {"id": str(row.index), "text": row.data, "label": row.label} + yield row.index, ex + else: + raise ValueError(f"Invalid config: {self.config.name}") + +if __name__ == "__main__": + datasets.load_dataset(__file__) From edcac302249658a6b6ce5e9e3133b71763182d47 Mon Sep 17 00:00:00 2001 From: Nadya Aditama Date: Mon, 26 Sep 2022 19:02:36 +0700 Subject: [PATCH 08/12] data loader for sunda twitter emotions dataset --- .../sunda_twitter_emotions.py | 125 ++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 nusacrowd/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py diff --git a/nusacrowd/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py b/nusacrowd/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py new file mode 100644 index 00000000..4071f6d2 --- /dev/null +++ b/nusacrowd/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py @@ -0,0 +1,125 @@ +import os +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets +import pandas as pd + +from nusacrowd.utils import schemas +from nusacrowd.utils.configs import NusantaraConfig +from nusacrowd.utils.constants 
import DEFAULT_NUSANTARA_VIEW_NAME, DEFAULT_SOURCE_VIEW_NAME, Tasks + +# TODO: Add BibTeX citation +_CITATION = """\ +@INPROCEEDINGS{ +9297929, +author={Putra, Oddy Virgantara and Wasmanson, Fathin Muhammad and Harmini, Triana and Utama, Shoffin Nahwa}, +booktitle={2020 International Conference on Computer Engineering, Network, and Intelligent Multimedia (CENIM)}, +title={Sundanese Twitter Dataset for Emotion Classification}, +year={2020}, +volume={}, +number={}, +pages={391-395}, +doi={10.1109/CENIM51130.2020.9297929} +} +""" + +_DATASETNAME = "sunda_twitter_emotions" + +_DESCRIPTION = """\ +This dataset is designed for Emotion Classification NLP task. +""" +_HOMEPAGE = "" + +_LICENSE = "UNKNOWN" + +_URLS = { + "datasets": "https://raw.githubusercontent.com/virgantara/sundanese-twitter-dataset/master/newdataset.csv" +} + +_SUPPORTED_TASKS = [Tasks.EMOTION_CLASSIFICATION] + + +_SOURCE_VERSION = "1.0.0" + +_NUSANTARA_VERSION = "1.0.0" + + +class NewDataset(datasets.GeneratorBasedBuilder): + """TODO: Short description of my dataset.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION) + + BUILDER_CONFIGS = [ + NusantaraConfig( + name="sunda_twitter_emotions_source", + version=SOURCE_VERSION, + description="Sundanese Twitter Dataset for Emotion source schema", + schema="source", + subset_id="sunda_twitter_emotions", + ), + NusantaraConfig( + name="sunda_twitter_emotions_nusantara_text", + version=NUSANTARA_VERSION, + description="Sundanese Twitter Dataset for Emotion Nusantara schema", + schema="nusantara_text", + subset_id="sunda_twitter_emotions", + ), + ] + + DEFAULT_CONFIG_NAME = "sunda_twitter_emotions_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source": + features = datasets.Features({ + "index": datasets.Value("string"), + "data": datasets.Value("string"), + "label": datasets.Value("string")}) + + # For example nusantara_kb, nusantara_t2t + elif self.config.schema == "nusantara_text": + features = schemas.text_features(["anger", "joy", "fear", "sadness"]) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + urls = _URLS + data_dir = Path(dl_manager.download_and_extract(urls['datasets'])) + data_files = {"train":data_dir} + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_files['train'], + "split": "train", + }, + ) + ] + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: + + df = pd.read_csv(filepath, sep=",", header="infer").reset_index() + df.columns = ["index","label", "data"] + + if self.config.schema == "source": + for row in df.itertuples(): + ex = {"index": str(row.index), "data": row.data, "label": row.label} + yield row.index, ex + elif self.config.schema == "nusantara_text": + for row in df.itertuples(): + ex = {"id": str(row.index), "text": row.data, "label": row.label} + yield row.index, ex + else: + raise ValueError(f"Invalid config: {self.config.name}") + +if __name__ == "__main__": + datasets.load_dataset(__file__) From b83862659bcf52dc226c462732c467d0d290006d Mon Sep 17 00:00:00 2001 From: Nadya Aditama Date: Mon, 26 Sep 2022 19:06:58 +0700 Subject: [PATCH 09/12] Delete sunda_twitter_emotions.py --- .../sunda_twitter_emotions.py | 125 ------------------ 1 file changed, 125 deletions(-) 
delete mode 100644 nusantara/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py diff --git a/nusantara/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py b/nusantara/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py deleted file mode 100644 index 9dfbb741..00000000 --- a/nusantara/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py +++ /dev/null @@ -1,125 +0,0 @@ -import os -from pathlib import Path -from typing import Dict, List, Tuple - -import datasets -import pandas as pd - -from nusantara.utils import schemas -from nusantara.utils.configs import NusantaraConfig -from nusantara.utils.constants import DEFAULT_NUSANTARA_VIEW_NAME, DEFAULT_SOURCE_VIEW_NAME, Tasks - -# TODO: Add BibTeX citation -_CITATION = """\ -@INPROCEEDINGS{ -9297929, -author={Putra, Oddy Virgantara and Wasmanson, Fathin Muhammad and Harmini, Triana and Utama, Shoffin Nahwa}, -booktitle={2020 International Conference on Computer Engineering, Network, and Intelligent Multimedia (CENIM)}, -title={Sundanese Twitter Dataset for Emotion Classification}, -year={2020}, -volume={}, -number={}, -pages={391-395}, -doi={10.1109/CENIM51130.2020.9297929} -} -""" - -_DATASETNAME = "sunda_twitter_emotions" - -_DESCRIPTION = """\ -This dataset is designed for Emotion Classification NLP task. -""" -_HOMEPAGE = "" - -_LICENSE = "UNKNOWN" - -_URLS = { - "datasets": "https://raw.githubusercontent.com/virgantara/sundanese-twitter-dataset/master/newdataset.csv" -} - -_SUPPORTED_TASKS = [Tasks.EMOTION_CLASSIFICATION] - - -_SOURCE_VERSION = "1.0.0" - -_NUSANTARA_VERSION = "1.0.0" - - -class NewDataset(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION) - - BUILDER_CONFIGS = [ - NusantaraConfig( - name="sunda_twitter_emotions_source", - version=SOURCE_VERSION, - description="Sundanese Twitter Dataset for Emotion source schema", - schema="source", - subset_id="sunda_twitter_emotions", - ), - NusantaraConfig( - name="sunda_twitter_emotions_nusantara_text", - version=NUSANTARA_VERSION, - description="Sundanese Twitter Dataset for Emotion Nusantara schema", - schema="nusantara_text", - subset_id="sunda_twitter_emotions", - ), - ] - - DEFAULT_CONFIG_NAME = "sunda_twitter_emotions_source" - - def _info(self) -> datasets.DatasetInfo: - - if self.config.schema == "source": - features = datasets.Features({ - "index": datasets.Value("string"), - "data": datasets.Value("string"), - "label": datasets.Value("string")}) - - # For example nusantara_kb, nusantara_t2t - elif self.config.schema == "nusantara_text": - features = schemas.text_features(["anger", "joy", "fear", "sadness"]) - - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=features, - homepage=_HOMEPAGE, - license=_LICENSE, - citation=_CITATION, - ) - - def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: - urls = _URLS - data_dir = Path(dl_manager.download_and_extract(urls['datasets'])) - data_files = {"train":data_dir} - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "filepath": data_files['train'], - "split": "train", - }, - ) - ] - - def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: - - df = pd.read_csv(filepath, sep=",", header="infer").reset_index() - df.columns = ["index","label", "data"] - - if self.config.schema == "source": - for row in df.itertuples(): - ex = {"index": 
str(row.index), "data": row.data, "label": row.label} - yield row.index, ex - elif self.config.schema == "nusantara_text": - for row in df.itertuples(): - ex = {"id": str(row.index), "text": row.data, "label": row.label} - yield row.index, ex - else: - raise ValueError(f"Invalid config: {self.config.name}") - -if __name__ == "__main__": - datasets.load_dataset(__file__) From 7135a82c33b3ebdedf4ba304e116bc48692526c3 Mon Sep 17 00:00:00 2001 From: Nadya Aditama Date: Mon, 26 Sep 2022 20:01:43 +0700 Subject: [PATCH 10/12] Delete sunda_twitter_emotions.py --- .../sunda_twitter_emotions.py | 125 ------------------ 1 file changed, 125 deletions(-) delete mode 100644 nusacrowd/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py diff --git a/nusacrowd/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py b/nusacrowd/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py deleted file mode 100644 index 4071f6d2..00000000 --- a/nusacrowd/nusa_datasets/sunda_twitter_emotions/sunda_twitter_emotions.py +++ /dev/null @@ -1,125 +0,0 @@ -import os -from pathlib import Path -from typing import Dict, List, Tuple - -import datasets -import pandas as pd - -from nusacrowd.utils import schemas -from nusacrowd.utils.configs import NusantaraConfig -from nusacrowd.utils.constants import DEFAULT_NUSANTARA_VIEW_NAME, DEFAULT_SOURCE_VIEW_NAME, Tasks - -# TODO: Add BibTeX citation -_CITATION = """\ -@INPROCEEDINGS{ -9297929, -author={Putra, Oddy Virgantara and Wasmanson, Fathin Muhammad and Harmini, Triana and Utama, Shoffin Nahwa}, -booktitle={2020 International Conference on Computer Engineering, Network, and Intelligent Multimedia (CENIM)}, -title={Sundanese Twitter Dataset for Emotion Classification}, -year={2020}, -volume={}, -number={}, -pages={391-395}, -doi={10.1109/CENIM51130.2020.9297929} -} -""" - -_DATASETNAME = "sunda_twitter_emotions" - -_DESCRIPTION = """\ -This dataset is designed for Emotion Classification NLP task. 
-""" -_HOMEPAGE = "" - -_LICENSE = "UNKNOWN" - -_URLS = { - "datasets": "https://raw.githubusercontent.com/virgantara/sundanese-twitter-dataset/master/newdataset.csv" -} - -_SUPPORTED_TASKS = [Tasks.EMOTION_CLASSIFICATION] - - -_SOURCE_VERSION = "1.0.0" - -_NUSANTARA_VERSION = "1.0.0" - - -class NewDataset(datasets.GeneratorBasedBuilder): - """TODO: Short description of my dataset.""" - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION) - - BUILDER_CONFIGS = [ - NusantaraConfig( - name="sunda_twitter_emotions_source", - version=SOURCE_VERSION, - description="Sundanese Twitter Dataset for Emotion source schema", - schema="source", - subset_id="sunda_twitter_emotions", - ), - NusantaraConfig( - name="sunda_twitter_emotions_nusantara_text", - version=NUSANTARA_VERSION, - description="Sundanese Twitter Dataset for Emotion Nusantara schema", - schema="nusantara_text", - subset_id="sunda_twitter_emotions", - ), - ] - - DEFAULT_CONFIG_NAME = "sunda_twitter_emotions_source" - - def _info(self) -> datasets.DatasetInfo: - - if self.config.schema == "source": - features = datasets.Features({ - "index": datasets.Value("string"), - "data": datasets.Value("string"), - "label": datasets.Value("string")}) - - # For example nusantara_kb, nusantara_t2t - elif self.config.schema == "nusantara_text": - features = schemas.text_features(["anger", "joy", "fear", "sadness"]) - - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=features, - homepage=_HOMEPAGE, - license=_LICENSE, - citation=_CITATION, - ) - - def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: - urls = _URLS - data_dir = Path(dl_manager.download_and_extract(urls['datasets'])) - data_files = {"train":data_dir} - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - gen_kwargs={ - "filepath": data_files['train'], - "split": "train", - }, - ) - ] - - def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: - - df = pd.read_csv(filepath, sep=",", header="infer").reset_index() - df.columns = ["index","label", "data"] - - if self.config.schema == "source": - for row in df.itertuples(): - ex = {"index": str(row.index), "data": row.data, "label": row.label} - yield row.index, ex - elif self.config.schema == "nusantara_text": - for row in df.itertuples(): - ex = {"id": str(row.index), "text": row.data, "label": row.label} - yield row.index, ex - else: - raise ValueError(f"Invalid config: {self.config.name}") - -if __name__ == "__main__": - datasets.load_dataset(__file__) From 2f5c4d8514e48308dea28909fdb8bf2181d384d0 Mon Sep 17 00:00:00 2001 From: Nadya Aditama Date: Tue, 4 Oct 2022 11:38:10 +0700 Subject: [PATCH 11/12] minor change su_emot --- nusacrowd/nusa_datasets/su_emot/su_emot.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/nusacrowd/nusa_datasets/su_emot/su_emot.py b/nusacrowd/nusa_datasets/su_emot/su_emot.py index 1c0f823c..91c120cb 100644 --- a/nusacrowd/nusa_datasets/su_emot/su_emot.py +++ b/nusacrowd/nusa_datasets/su_emot/su_emot.py @@ -51,7 +51,7 @@ _NUSANTARA_VERSION = "1.0.0" -class suEmot(datasets.GeneratorBasedBuilder): +class SuEmot(datasets.GeneratorBasedBuilder): """This is a dataset for emotion classification of Sundanese text. 
The dataset is gathered from Twitter API between January and March 2019 with 2518 tweets in total.""" SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) @@ -118,14 +118,12 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: if self.config.schema == "source": for row in df.itertuples(): - ex = {"index": str(row.index), "data": row.data, "label": row.label} + ex = {"index": str(row.index+1), "data": row.data, "label": row.label} yield row.index, ex elif self.config.schema == "nusantara_text": for row in df.itertuples(): - ex = {"id": str(row.index), "text": row.data, "label": row.label} + ex = {"id": str(row.index+1), "text": row.data, "label": row.label} yield row.index, ex - else: - raise ValueError(f"Invalid config: {self.config.name}") if __name__ == "__main__": datasets.load_dataset(__file__) From 773e9731fd0ee4f46a98d759654dd135ae3870b3 Mon Sep 17 00:00:00 2001 From: Nadya Aditama Date: Tue, 4 Oct 2022 14:17:35 +0700 Subject: [PATCH 12/12] minor change su_emot.py --- nusacrowd/nusa_datasets/su_emot/su_emot.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/nusacrowd/nusa_datasets/su_emot/su_emot.py b/nusacrowd/nusa_datasets/su_emot/su_emot.py index 91c120cb..b7582a08 100644 --- a/nusacrowd/nusa_datasets/su_emot/su_emot.py +++ b/nusacrowd/nusa_datasets/su_emot/su_emot.py @@ -124,6 +124,3 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: for row in df.itertuples(): ex = {"id": str(row.index+1), "text": row.data, "label": row.label} yield row.index, ex - -if __name__ == "__main__": - datasets.load_dataset(__file__)
-
-if __name__ == "__main__":
-    datasets.load_dataset(__file__)
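
A quick way to sanity-check the final loader state (after patches 11 and 12) is to load both configs directly from the script. The following is a minimal smoke-test sketch, not part of the patch series: the script path assumes the in-repo location introduced in patch 05, and running it needs network access to fetch the raw CSV from GitHub.

    # Smoke-test sketch for the final su_emot loader (assumed local path;
    # adjust to wherever the script lives on disk).
    import datasets

    for config_name in ("su_emot_source", "su_emot_nusantara_text"):
        ds = datasets.load_dataset(
            "nusacrowd/nusa_datasets/su_emot/su_emot.py",
            name=config_name,
            split="train",
        )
        # Per the dataset description: 2518 tweets, labels drawn from
        # anger / joy / fear / sadness. Source schema uses "index"/"data",
        # nusantara_text uses "id"/"text".
        print(config_name, ds.num_rows, ds[0])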
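
Patch 11's `row.index + 1` change is easy to misread: `reset_index()` materialises a 0-based integer column that the loader renames to "index", and `itertuples()` then exposes it as `row.index` (distinct from the namedtuple's frame-index field `row.Index`). A standalone sketch of that mechanic, using synthetic rows rather than the real CSV:

    import pandas as pd

    # Synthetic stand-in for newdataset.csv (two columns: label, then text).
    df = pd.DataFrame({"label": ["joy", "anger"],
                       "tweet": ["bungah pisan", "ambek pisan"]})

    # Mirror _generate_examples: reset_index() adds a 0-based "index" column,
    # then the three columns are renamed positionally.
    df = df.reset_index()
    df.columns = ["index", "label", "data"]

    for row in df.itertuples():
        # row.index is the 0-based column from reset_index(); the emitted id
        # is shifted to 1-based while the generator key stays 0-based.
        example = {"id": str(row.index + 1), "text": row.data, "label": row.label}
        print(row.index, example)
    # -> 0 {'id': '1', 'text': 'bungah pisan', 'label': 'joy'}
    #    1 {'id': '2', 'text': 'ambek pisan', 'label': 'anger'}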