From d484f28e0554b1e9f298d7f24fc72d78728310d7 Mon Sep 17 00:00:00 2001
From: Christian Wibisono
Date: Tue, 28 Jun 2022 18:32:20 +0700
Subject: [PATCH] add and apply pre-commit hook

---
 .github/ISSUE_TEMPLATE/add-dataset.md         |  1 -
 .pre-commit-config.yaml                       | 17 ++++
 DATALOADER.md                                 |  8 +-
 POINTS.id.md                                  |  2 +-
 POINTS.md                                     |  3 +-
 README.md                                     |  8 +-
 UPLOADING.id.md                               | 10 +--
 UPLOADING.md                                  | 26 +++---
 nusantara/nusa_datasets/bapos/bapos.py        | 48 ++++-------
 .../nusa_datasets/bible_en_id/bible_en_id.py  | 58 +++++--------
 .../nusa_datasets/bible_su_id/bible_su_id.py  | 58 +++++--------
 nusantara/nusa_datasets/emot/emot.py          |  3 +-
 .../nusa_datasets/id_abusive/id_abusive.py    |  1 -
 .../id_clickbait/id_clickbait.py              |  2 +-
 .../id_hatespeech/id_hatespeech.py            |  1 -
 .../indo_religious_mt_en_id.py                | 53 ++++++------
 nusantara/nusa_datasets/smsa/smsa.py          | 63 ++++++---------
 .../stif_indonesia/stif_indonesia.py          | 81 +++++++++----------
 nusantara/utils/common_parser.py              | 15 +---
 nusantara/utils/constants.py                  | 24 +++---
 nusantara/utils/schemas/__init__.py           |  9 +--
 nusantara/utils/schemas/seq_label.py          | 10 +--
 nusantara/utils/schemas/text_to_text.py       |  4 +-
 templates/template.py                         |  2 -
 test_example.sh                               |  2 +-
 tests/test_nusantara.py                       | 57 +++----------
 26 files changed, 223 insertions(+), 343 deletions(-)
 create mode 100644 .pre-commit-config.yaml

diff --git a/.github/ISSUE_TEMPLATE/add-dataset.md b/.github/ISSUE_TEMPLATE/add-dataset.md
index 5f68e462..fd81c75d 100644
--- a/.github/ISSUE_TEMPLATE/add-dataset.md
+++ b/.github/ISSUE_TEMPLATE/add-dataset.md
@@ -20,4 +20,3 @@ assignees: ''
 - **Is Synthetic:** *Yes/No. Put yes if the dataset is generated synthetically somehow, for example by translating from other languages, by generating from language models, or by using a CFG, etc.*
 - **License:** *Type of license; please provide public for new datasets*
 - **Motivation:** *what are some good reasons to have this dataset*
-
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..b3b12c88
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,17 @@
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v2.3.0
+  hooks:
+  - id: check-yaml
+  - id: end-of-file-fixer
+  - id: trailing-whitespace
+- repo: https://github.com/hadialqattan/pycln
+  rev: v1.3.5
+  hooks:
+  - id: pycln
+    args: [--all]
+- repo: https://github.com/psf/black
+  rev: 22.3.0
+  hooks:
+  - id: black
+    args: [--line-length=250, --target-version=py38]
diff --git a/DATALOADER.md b/DATALOADER.md
index 927c3276..fe142fea 100644
--- a/DATALOADER.md
+++ b/DATALOADER.md
@@ -9,7 +9,7 @@ You will also need at least Python 3.6+. If you are installing python, we recomm
 
 **Optional:** Set up your GitHub account with SSH ([instructions here](https://docs.github.com/en/authentication/connecting-to-github-with-ssh)).
 
 ### 1. **Assigning a dataloader**
-- Choose a dataset from the [list of Nusantara datasets](https://github.com/orgs/IndoNLP/projects/2).
+- Choose a dataset from the [list of Nusantara datasets](https://github.com/orgs/IndoNLP/projects/2).

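The `.pre-commit-config.yaml` added above layers three kinds of checks: hygiene hooks (`check-yaml`, `end-of-file-fixer`, `trailing-whitespace`), `pycln` to strip unused imports, and `black` with a 250-character line length targeting Python 3.8, which is what produces the whitespace-only churn in the hunks below. Once `pre-commit install` has been run, the hooks fire automatically on every `git commit`. To preview what the configured `black` settings do to a snippet, black can also be called programmatically; a minimal sketch, assuming `black==22.3.0` is installed (the sample source string here is made up):

```python
# A minimal sketch of the black settings configured above; assumes
# black==22.3.0 is installed. The sample source string is made up.
import black

mode = black.Mode(
    line_length=250,                             # mirrors args: [--line-length=250]
    target_versions={black.TargetVersion.PY38},  # mirrors --target-version=py38
)

messy = "x = {  'a':1,   'b':2 }"
print(black.format_str(messy, mode=mode))  # prints: x = {"a": 1, "b": 2}
```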
@@ -104,7 +104,7 @@ Make a new directory within the `nusa-crowd/nusantara/nusa_datasets` directory:
 
     mkdir nusantara/nusa_datasets/<dataset_name>
 
-Please use lowercase letters and underscores when choosing a `<dataset_name>`.
+Please use lowercase letters and underscores when choosing a `<dataset_name>`.
 
 To implement your dataset, there are three key methods that are important:
 
 * `_info`: Specifies the schema of the expected dataloader
@@ -141,9 +141,9 @@ if __name__ == "__main__":
 ```
 
 If you want to use an interactive debugger during development, you will have to use
-`breakpoint()` instead of setting breakpoints directly in your IDE. Most IDEs will
+`breakpoint()` instead of setting breakpoints directly in your IDE. Most IDEs will
 recognize the `breakpoint()` statement and pause there during debugging. If your preferred
-IDE doesn't support this, you can always run the script in your terminal and debug with
+IDE doesn't support this, you can always run the script in your terminal and debug with
 `pdb`.
diff --git a/POINTS.id.md b/POINTS.id.md
index 5c2b58ff..2e526f2f 100644
--- a/POINTS.id.md
+++ b/POINTS.id.md
@@ -4,7 +4,7 @@ Untuk dianggap sebagai co-author, diperlukan 10 poin kontribusi.
 
 ## Data Loader
 
-Menerapkan data loader apa pun diberikan +3 poin.
+Menerapkan data loader apa pun diberikan +3 poin.
 Info lebih lanjut dapat ditemukan [di sini](DATALOADER.md).
 
 ## Proposal Dataset
diff --git a/POINTS.md b/POINTS.md
index 55561cbe..6899a5ee 100644
--- a/POINTS.md
+++ b/POINTS.md
@@ -41,8 +41,7 @@ We can have 4 different levels: Small, Medium, Large, XL
 
 ## Examples
 
-Let's assume a new sentiment analysis dataset for one of the Papuan languages, consisting of 500 sentences.
+Let's assume a new sentiment analysis dataset for one of the Papuan languages, consisting of 500 sentences.
 For data size, it is considered Small (+1 pt). While sentiment analysis is common, the language itself is extremely rare and underrepresented, so we get +6 pts for it. Lastly, assuming the data is high-quality, we obtain a total of (1 + 6) * 1.5 pts = 10.5 pts, which is enough for authorship.
 
 Another example: let's assume a new Natural Language Inference (NLI) dataset for Javanese. NLI by itself is not new for Indonesian languages, and Javanese resources are available. However, a Javanese NLI dataset would be the first ever, hence it is still considered rare (+6 pts). Assuming the dataset is Small in size with Good quality, we end up with a total of 7 pts. By additionally implementing the data loader for this dataset, we'll have a total of 10 pts, which is enough for authorship.
-
diff --git a/README.md b/README.md
index 04417ba1..57cd2d66 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ Indonesian NLP is underrepresented in the research community, and one of the reasons
 
 ## How to contribute?
 
-You can contribute by proposing an **unregistered NLP dataset** to [our record](https://indonlp.github.io/nusa-catalogue/). You can also propose datasets from your past work that have not been released to the public. [Just fill out this form](https://forms.gle/31dMGZik25DPFYFd6), and we will check and approve your entry.
+You can contribute by proposing an **unregistered NLP dataset** to [our record](https://indonlp.github.io/nusa-catalogue/). You can also propose datasets from your past work that have not been released to the public. [Just fill out this form](https://forms.gle/31dMGZik25DPFYFd6), and we will check and approve your entry.
 
 We will give **contribution points** based on several factors, including **dataset quality**, **language scarcity**, and **task scarcity**.
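The DATALOADER.md hunks above name three key methods (`_info`, `_split_generators`, `_generate_examples`) and recommend `breakpoint()` for debugging. As a rough sketch of that shape, assuming the HuggingFace `datasets` library that these dataloaders build on; the class name, features, and download URL are hypothetical placeholders, not the repository's actual template:

```python
# A hedged sketch of the three key methods named in DATALOADER.md, using the
# HuggingFace `datasets` library; the class name, features, and URL below are
# hypothetical placeholders.
import datasets


class MyDatasetLoader(datasets.GeneratorBasedBuilder):
    def _info(self):
        # Specifies the schema of the expected dataloader.
        return datasets.DatasetInfo(
            features=datasets.Features(
                {"id": datasets.Value("string"), "text": datasets.Value("string")}
            )
        )

    def _split_generators(self, dl_manager):
        # Downloads (or locates) the raw data and wires it to splits.
        path = dl_manager.download_and_extract("https://example.com/train.txt")  # hypothetical URL
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"filepath": path}
            )
        ]

    def _generate_examples(self, filepath):
        # Yields (key, example) pairs matching the schema declared in _info;
        # a breakpoint() placed here pauses execution for pdb-style debugging.
        with open(filepath, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                yield idx, {"id": str(idx), "text": line.strip()}


if __name__ == "__main__":
    datasets.load_dataset(__file__)
```

Running the script directly, as in the `if __name__ == "__main__":` snippet from DATALOADER.md, exercises all three methods in order, so a `breakpoint()` inside `_generate_examples` drops into `pdb` on the first yielded example.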
@@ -36,7 +36,7 @@ The license for a dataset is not always obvious. Here are some strategies to try
 
 * check publications that announce the release of the dataset
 * check the website of the organization providing the dataset
 
-If no official license is listed anywhere, but you find a webpage that describes general data usage policies for the dataset, you can fall back to providing that URL in the `_LICENSE` variable. If you can't find any license information, please note this in your PR and put `_LICENSE="Unknown"` in your dataset script.
+If no official license is listed anywhere, but you find a webpage that describes general data usage policies for the dataset, you can fall back to providing that URL in the `_LICENSE` variable. If you can't find any license information, please note this in your PR and put `_LICENSE="Unknown"` in your dataset script.
 
 #### What if my dataset is not yet publicly available?
 
@@ -49,11 +49,11 @@ Yes, you can ask for help in NusaCrowd's community channel! Please join our [Wh
 
 ## Thank you!
 
-We greatly appreciate your help!
+We greatly appreciate your help!
 
 The artifacts of this hackathon will be described in a forthcoming academic paper targeting a machine learning or NLP audience. Please refer to [this section](#contribution-guidelines) for your contribution rewards for helping Nusantara NLP. We recognize that some datasets require more effort than others, so please reach out if you have questions. Our goal is to be inclusive with credit!
-
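On the `_LICENSE` guidance in the README hunk above, the fallback in a dataset script might look like the following sketch; the policy URL is hypothetical:

```python
# Sketch of the _LICENSE fallback described above; the URL is hypothetical.
_LICENSE = "https://example.org/corpus/data-usage-policy"  # page describing general usage terms

# If no license information can be found at all, note this in the PR and use:
# _LICENSE = "Unknown"
```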