diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 5a87de32..450fcce8 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,3 +1,3 @@
 # These are the current maintainers/admin of the nusantara-datasets repo
-* @samuelcahyawijaya @afaji @holylovenia @gentaiscool @bryanwilie @fajri91
+* @samuelcahyawijaya @afaji @holylovenia @gentaiscool @bryanwilie @fajri91 @christianwbsn @muhsatrio
 
diff --git a/.github/workflows/update-readme.yml b/.github/workflows/update-readme.yml
new file mode 100644
index 00000000..d3718a89
--- /dev/null
+++ b/.github/workflows/update-readme.yml
@@ -0,0 +1,31 @@
+name: Update README
+on:
+  push:
+    branches:
+      - master
+  workflow_dispatch:
+  schedule:
+    - cron: '0 0 * * 0'
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Check out repo
+      uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.8
+        architecture: x64
+    - name: Update README
+      run: |-
+        python update_readme.py
+        cat README.md
+    - name: Commit and push if changed
+      run: |-
+        git diff
+        git config --global user.email "readme-bot@indonlp.com"
+        git config --global user.name "README-Bot"
+        git add -A
+        git commit -m "Updated progress bar" || exit 0
+        git push
diff --git a/README.id.md b/README.id.md
index 9dbff227..7dccf742 100644
--- a/README.id.md
+++ b/README.id.md
@@ -4,13 +4,15 @@
 
 ![Dataset claimed](https://progress-bar.dev/81/?title=Datasets%20Claimed%20(77%20Datasets%20Claimed))
 
+<!-- milestone starts -->
 ![Milestone 1](https://progress-bar.dev/100/?title=Milestone%201%20(30%20Datasets%20Completed))
 
-![Milestone 2](https://progress-bar.dev/80/?title=Milestone%202%20(60%20Datasets%20Completed))
+![Milestone 2](https://progress-bar.dev/95/?title=Milestone%202%20(60%20Datasets%20Completed))
 
-![Milestone 3](https://progress-bar.dev/48/?title=Milestone%203%20(100%20Datasets%20Completed))
+![Milestone 3](https://progress-bar.dev/57/?title=Milestone%203%20(100%20Datasets%20Completed))
 
-![Milestone 4](https://progress-bar.dev/32/?title=Milestone%204%20(150%20Datasets%20Completed))
+![Milestone 4](https://progress-bar.dev/38/?title=Milestone%204%20(150%20Datasets%20Completed))
+<!-- milestone ends -->
 
 *Read this README in [English](README.md).*
 
diff --git a/README.md b/README.md
index 8c1b859f..06cb27ff 100644
--- a/README.md
+++ b/README.md
@@ -4,13 +4,15 @@
 
 ![Dataset claimed](https://progress-bar.dev/80/?title=Datasets%20Claimed%20(77%20Datasets%20Claimed))
 
+<!-- milestone starts -->
 ![Milestone 1](https://progress-bar.dev/100/?title=Milestone%201%20(30%20Datasets%20Completed))
 
-![Milestone 2](https://progress-bar.dev/80/?title=Milestone%202%20(60%20Datasets%20Completed))
+![Milestone 2](https://progress-bar.dev/95/?title=Milestone%202%20(60%20Datasets%20Completed))
 
-![Milestone 3](https://progress-bar.dev/48/?title=Milestone%203%20(100%20Datasets%20Completed))
+![Milestone 3](https://progress-bar.dev/57/?title=Milestone%203%20(100%20Datasets%20Completed))
 
-![Milestone 4](https://progress-bar.dev/32/?title=Milestone%204%20(150%20Datasets%20Completed))
+![Milestone 4](https://progress-bar.dev/38/?title=Milestone%204%20(150%20Datasets%20Completed))
+<!-- milestone ends -->
 
 *Baca README ini dalam [Bahasa Indonesia](README.id.md).*
 
@@ -60,7 +62,7 @@ You can upload your dataset publicly first, eg. on Github.
 
 #### Can I create a PR if I have an idea?
 
-If you have an idea to improve or change the code of the nusa-crowd repository, please create an `issue` and ask for `feedback` before starting any PRs. 
+If you have an idea to improve or change the code of the nusa-crowd repository, please create an `issue` and ask for `feedback` before starting any PRs.
 
 #### I am confused, can you help me?
 
diff --git a/nusantara/nusa_datasets/id_qqp/id_qqp.py b/nusantara/nusa_datasets/id_qqp/id_qqp.py
index 87f292e0..496becdf 100644
--- a/nusantara/nusa_datasets/id_qqp/id_qqp.py
+++ b/nusantara/nusa_datasets/id_qqp/id_qqp.py
@@ -10,14 +10,22 @@ import json
 
 _CITATION = """\
-    https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs
+@misc{quoraFirstQuora,
+  author = {},
+  title = {{F}irst {Q}uora {D}ataset {R}elease: {Q}uestion {P}airs --- quoradata.quora.com},
+  howpublished = {https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs},
+  year = 2017,
+  note = {Online},
+}
 """
 
 _DATASETNAME = "id_qqp"
 
 _DESCRIPTION = """\
-INDOSUM is a new benchmark dataset for Indonesian text summarization.
-The dataset consists of news articles and manually constructed summaries.
+The Quora Question Pairs (QQP) dataset consists of over 400,000 question pairs,
+and each question pair is annotated with a binary value indicating whether
+the two questions are paraphrases of each other. This dataset is a translated
+version of QQP into Indonesian.
 """
 
 _HOMEPAGE = "https://github.com/louisowen6/quora_paraphrasing_id"
 
@@ -38,8 +46,13 @@ _NUSANTARA_VERSION = "1.0.0"
 
 
-class IndoSUM(datasets.GeneratorBasedBuilder):
-    """INDOSUM is a new benchmark dataset for Indonesian text summarization. The dataset consists of news articles and manually constructed summaries."""
+class IdQuoraQuestionPairs(datasets.GeneratorBasedBuilder):
+    """
+    The Quora Question Pairs (QQP) dataset consists of over 400,000 question pairs,
+    and each question pair is annotated with a binary value indicating whether
+    the two questions are paraphrases of each other. This dataset is a translated
+    version of QQP into Indonesian.
+    """
 
     SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
     NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION)
diff --git a/nusantara/nusa_datasets/id_stance/id_stance.py b/nusantara/nusa_datasets/id_stance/id_stance.py
new file mode 100644
index 00000000..7a248db6
--- /dev/null
+++ b/nusantara/nusa_datasets/id_stance/id_stance.py
@@ -0,0 +1,133 @@
+import json
+from pathlib import Path
+from typing import List
+
+import datasets
+import pandas as pd
+
+from nusantara.utils import schemas
+from nusantara.utils.configs import NusantaraConfig
+from nusantara.utils.constants import Tasks
+
+_CITATION = """\
+@INPROCEEDINGS{8629144,
+  author={R. {Jannati} and R. {Mahendra} and C. W. {Wardhana} and M. {Adriani}},
+  booktitle={2018 International Conference on Asian Language Processing (IALP)},
+  title={Stance Classification Towards Political Figures on Blog Writing},
+  year={2018},
+  volume={},
+  number={},
+  pages={96-101},
+}
+"""
+_DATASETNAME = "id_stance"
+_DESCRIPTION = """\
+Stance Classification Towards Political Figures on Blog Writing.
+This dataset comes from the second study, which combined the data of the first study with newly collected data.
+It consists of 337 examples covering five targets, and each target is paired with one event.
+Two labels are used: 'for' and 'against'.
+1. For - the text written by the author supports the target in the event
+2. Against - the text written by the author opposes the target in the event
+"""
+_HOMEPAGE = "https://github.com/reneje/id_stance_dataset_article-Stance-Classification-Towards-Political-Figures-on-Blog-Writing"
+_LICENSE = "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License"
+_URLs = {
+    _DATASETNAME: "https://raw.githubusercontent.com/reneje/id_stance_dataset_article-Stance-Classification-Towards-Political-Figures-on-Blog-Writing/master/dataset_stance_2_label_2018_building_by_rini.csv"
+}
+_SUPPORTED_TASKS = [Tasks.TEXTUAL_ENTAILMENT]
+_SOURCE_VERSION = "1.0.0"
+_NUSANTARA_VERSION = "1.0.0"
+
+
+def parse_list(content):
+    if (not content):
+        return []
+    try:
+        return json.loads(content)
+    except:
+        return json.loads("[\"" + content[1:-1].replace("\"", "\\\"") + "\"]")
+
+
+class IdStance(datasets.GeneratorBasedBuilder):
+    """The ID Stance dataset is annotated with a label whether the article is in favor of the person in the context of the event"""
+
+    BUILDER_CONFIGS = [
+        NusantaraConfig(
+            name="id_stance_source",
+            version=datasets.Version(_SOURCE_VERSION),
+            description="IdStance source schema",
+            schema="source",
+            subset_id="id_stance",
+        ),
+        NusantaraConfig(
+            name="id_stance_nusantara_pairs",
+            version=datasets.Version(_NUSANTARA_VERSION),
+            description="IdStance Nusantara schema",
+            schema="nusantara_pairs",
+            subset_id="id_stance",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = "id_stance_source"
+
+    def _info(self):
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "person": datasets.Value("string"),
+                    "event": datasets.Value("string"),
+                    "title": datasets.Value("string"),
+                    "content": datasets.Value("string"),
+                    "stance_final": datasets.Value("string"),
+                }
+            )
+        elif self.config.schema == "nusantara_pairs":
+            features = schemas.pairs_features(["for", "against", "againts", "no"])
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        data_path = Path(dl_manager.download_and_extract(_URLs[_DATASETNAME]))
+        data_files = {
+            "train": data_path,
+        }
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"filepath": data_files["train"]},
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path):
+        df = pd.read_csv(filepath, sep=";", header="infer", keep_default_na=False).reset_index()
+        df.columns = ["id", "person", "event", "title", "content", "stance_final", ""]
+        df.content = df.content.apply(parse_list)
+
+        if self.config.schema == "source":
+            for row in df.itertuples():
+                ex = {
+                    "person": row.person,
+                    "event": row.event,
+                    "title": row.title,
+                    "content": " ".join(row.content),
+                    "stance_final": row.stance_final
+                }
+                yield row.id, ex
+        elif self.config.schema == "nusantara_pairs":
+            for row in df.itertuples():
+                ex = {
+                    "id": row.id,
+                    "text_1": row.person + " | " + row.event,
+                    "text_2": " ".join([row.title] + row.content),
+                    "label": row.stance_final
+                }
+                yield row.id, ex
+        else:
+            raise ValueError(f"Invalid config: {self.config.name}")
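Reviewer note on the `parse_list` helper above: the raw CSV stores the article body either as a JSON list or as a bracket-wrapped plain string, so the helper falls back to wrapping the bracket-stripped text as a one-element list. A minimal, self-contained sketch of that behaviour (the sample strings are invented, and the bare `except` is narrowed for the sketch):

```python
import json

def parse_list(content):
    # Mirrors the fallback logic in id_stance.py.
    if not content:
        return []
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        return json.loads('["' + content[1:-1].replace('"', '\\"') + '"]')

print(parse_list('["kalimat pertama", "kalimat kedua"]'))         # two elements
print(parse_list('[teks biasa dengan "kutipan" di dalamnya]'))     # one element, quotes preserved
```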
diff --git a/nusantara/nusa_datasets/indo_puisi/indo_puisi.py b/nusantara/nusa_datasets/indo_puisi/indo_puisi.py
new file mode 100644
index 00000000..f6d16dc6
--- /dev/null
+++ b/nusantara/nusa_datasets/indo_puisi/indo_puisi.py
@@ -0,0 +1,114 @@
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+import pandas as pd
+
+from nusantara.utils import schemas
+from nusantara.utils.configs import NusantaraConfig
+from nusantara.utils.constants import (DEFAULT_NUSANTARA_VIEW_NAME,
+                                       DEFAULT_SOURCE_VIEW_NAME, Tasks)
+
+_DATASETNAME = "indo_puisi"
+_SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME
+_UNIFIED_VIEW_NAME = DEFAULT_NUSANTARA_VIEW_NAME
+
+_CITATION = """
+"""
+
+_DESCRIPTION = """\
+Puisi is an Indonesian poetic form. The dataset was collected by scraping various websites. It contains 7223 Indonesian puisi along with the title and author.
+"""
+
+_HOMEPAGE = "https://github.com/ilhamfp/puisi-pantun-generator"
+
+_LICENSE = "Creative Commons Attribution Share-Alike 4.0 International"
+
+_SUPPORTED_TASKS = [Tasks.SELF_SUPERVISED_PRETRAINING]
+
+_SOURCE_VERSION = "1.0.0"
+
+_NUSANTARA_VERSION = "1.0.0"
+
+_URLS = {
+    "train": "https://raw.githubusercontent.com/ilhamfp/puisi-pantun-generator/main/data/puisi.csv",
+}
+
+
+class IndoPuisi(datasets.GeneratorBasedBuilder):
+    """IndoPuisi contains 7223 Indonesian puisi along with the title and author."""
+
+    BUILDER_CONFIGS = (
+        NusantaraConfig(
+            name="indo_puisi_source",
+            version=_SOURCE_VERSION,
+            description="Indo puisi source schema",
+            schema="source",
+            subset_id="indo_puisi",
+        ),
+        NusantaraConfig(
+            name="indo_puisi_nusantara_ssp",
+            version=_NUSANTARA_VERSION,
+            description="Indo puisi Nusantara schema",
+            schema="nusantara_ssp",
+            subset_id="indo_puisi",
+        ),
+    )
+
+    DEFAULT_CONFIG_NAME = "indo_puisi_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "id": datasets.Value("string"),
+                    "puisi": datasets.Value("string"),
+                    "title": datasets.Value("string"),
+                    "author": datasets.Value("string"),
+                    "puisi_with_header": datasets.Value("string"),
+                }
+            )
+        elif self.config.schema == "nusantara_ssp":
+            features = schemas.self_supervised_pretraining.features
+        else:
+            raise ValueError(f"Invalid config schema: {self.config.schema}")
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        train_csv_path = Path(dl_manager.download(_URLS["train"]))
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"filepath": train_csv_path},
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
+        if self.config.schema != "source" and self.config.schema != "nusantara_ssp":
+            raise ValueError(f"Invalid config schema: {self.config.schema}")
+
+        df = pd.read_csv(filepath).reset_index()
+        if self.config.name == "indo_puisi_source":
+            for row in df.itertuples():
+                ex = {
+                    "id": str(row.index),
+                    "puisi": str(row.puisi).rstrip(),
+                    "title": row.title,
+                    "author": row.author,
+                    "puisi_with_header": str(row.puisi_with_header).rstrip(),
+                }
+                yield row.index, ex
+
+        elif self.config.name == "indo_puisi_nusantara_ssp":
+            for row in df.itertuples():
+                ex = {"id": str(row.index), "text": str(row.puisi).rstrip()}
+                yield row.index, ex
diff --git a/nusantara/nusa_datasets/pos_sun_mono/pos_sun_mono.py b/nusantara/nusa_datasets/pos_sun_mono/pos_sun_mono.py
new file mode 100644
index 00000000..175973dc
--- /dev/null
+++ b/nusantara/nusa_datasets/pos_sun_mono/pos_sun_mono.py
@@ -0,0 +1,268 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from nusantara.utils import schemas
+from nusantara.utils.configs import NusantaraConfig
+from nusantara.utils.constants import Tasks
+
+_CITATION = """\
+@data{FK2/VTAHRH_2022,
+  author = {ARDIYANTI SURYANI, ARIE and Widyantoro, Dwi Hendratmo and Purwarianti, Ayu and Sudaryat, Yayat},
+  publisher = {Telkom University Dataverse},
+  title = {{PoSTagged Sundanese Monolingual Corpus}},
+  year = {2022},
+  version = {DRAFT VERSION},
+  doi = {10.34820/FK2/VTAHRH},
+  url = {https://doi.org/10.34820/FK2/VTAHRH}
+}
+
+@INPROCEEDINGS{7437678,
+  author={Suryani, Arie Ardiyanti and Widyantoro, Dwi Hendratmo and Purwarianti, Ayu and Sudaryat, Yayat},
+  booktitle={2015 International Conference on Information Technology Systems and Innovation (ICITSI)},
+  title={Experiment on a phrase-based statistical machine translation using PoS Tag information for Sundanese into Indonesian},
+  year={2015},
+  volume={},
+  number={},
+  pages={1-6},
+  doi={10.1109/ICITSI.2015.7437678}
+}
+"""
+
+_DATASETNAME = "pos_sun_mono"
+
+_DESCRIPTION = """\
+This dataset contains 3616 lines of Sundanese sentences taken from several online magazines (Mangle, Dewan Dakwah Jabar, and Balebat), \
+annotated with PoS labels by several undergraduates of the Sundanese Language Education Study Program (PPBS), UPI Bandung.
+"""
+
+_HOMEPAGE = "https://dataverse.telkomuniversity.ac.id/dataset.xhtml?persistentId=doi:10.34820/FK2/VTAHRH"
+
+_LICENSE = 'CC0 - "Public Domain Dedication"'
+
+_URLS = {
+    _DATASETNAME: "https://dataverse.telkomuniversity.ac.id/api/access/datafile/:persistentId?persistentId=doi:10.34820/FK2/VTAHRH/WQIFK8",
+}
+
+_SUPPORTED_TASKS = [Tasks.POS_TAGGING]
+
+_SOURCE_VERSION = "1.1.0"
+
+_NUSANTARA_VERSION = "1.0.0"
+
+
+class PosSunMonoDataset(datasets.GeneratorBasedBuilder):
+    """PoSTagged Sundanese Monolingual Corpus"""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION)
+
+    # Based on Wicaksono, A. F., & Purwarianti, A. (2010). HMM Based Part-of-Speech Tagger for Bahasa Indonesia. In Proceedings of the 4th International MALINDO (Malay and Indonesian Language) Workshop.
+    POS_TAGS = [
+        "",
+        "!",
+        '"',
+        "'",
+        ")",
+        ",",
+        "-",
+        ".",
+        "...",
+        "....",
+        "/",
+        ":",
+        ";",
+        "?",
+        "C",
+        "CBI",
+        "CC",
+        "CDC",
+        "CDI",
+        "CDO",
+        "CDP",
+        "CDT",
+        "CP",
+        "CRB",
+        "CS",
+        "DC",
+        "DT",
+        "FE",
+        "FW",
+        "GM",
+        "IN",
+        "J",
+        "JJ",
+        "KA",
+        "KK",
+        "MD",
+        "MG",
+        "MN",
+        "N",
+        "NEG",
+        "NN",
+        "NNA",
+        "NNG",
+        "NNN",
+        "NNO",
+        "NNP",
+        "NNPP",
+        "NP",
+        "NPP",
+        "OP",
+        "PB",
+        "PCDP",
+        "PR",
+        "PRL",
+        "PRL|IN",
+        "PRN",
+        "PRP",
+        "RB",
+        "RBT",
+        "RB|RP",
+        "RN",
+        "RP",
+        "SC",
+        "SCC",
+        "SC|IN",
+        "SYM",
+        "UH",
+        "VB",
+        "VBI",
+        "VBT",
+        "VRB",
+        "W",
+        "WH",
+        "WHP",
+        "WRP",
+        "`",
+        "–",
+        "—",
+        "‘",
+        "’",
+        "“",
+        "”",
+    ]
+
+    BUILDER_CONFIGS = [
+        NusantaraConfig(
+            name=f"{_DATASETNAME}_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=f"{_DATASETNAME}",
+        ),
+        NusantaraConfig(
+            name=f"{_DATASETNAME}_nusantara_seq_label",
+            version=NUSANTARA_VERSION,
+            description=f"{_DATASETNAME} Nusantara Seq Label schema",
+            schema="nusantara_seq_label",
+            subset_id=f"{_DATASETNAME}",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+
+        if self.config.schema == "source":
+            features = datasets.Features({"labeled_sentence": datasets.Value("string")})
+        elif self.config.schema == "nusantara_seq_label":
+            features = schemas.seq_label_features(self.POS_TAGS)
+
+        else:
+            raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.")
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        urls = _URLS[_DATASETNAME]
+        data_path = dl_manager.download(urls)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": data_path,
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+
+        def __hotfix(line):
+            if line.endswith(" taun|NN 1953.|."):
+                return line.replace(" taun|NN 1953.|.", " taun|NN 1953|CDP .|.")
+            elif line.endswith(" jeung|CC|CC sasab|RB .|."):
+                return line.replace(" jeung|CC|CC sasab|RB .|.", " jeung|CC sasab|RB .|.")
+            elif line.startswith("Kagiatan|NN éta|DT dihadiran|VBT kira|-kira "):
+                return line.replace("Kagiatan|NN éta|DT dihadiran|VBT kira|-kira ", "Kagiatan|NN éta|DT dihadiran|VBT kira-kira|DT ")
+            return line
+
+        with open(filepath, "r", encoding="utf8") as ipt:
+            raw = list(map(lambda l: __hotfix(l.rstrip("\n ")), ipt))
+
+        pat_0 = r"(,\|,|\?\|\?|-\|-|!\|!)"
+        repl_spc = r" \1 "
+
+        pat_1 = r"([A-Z”])(\.\|\.)"
+        pat_2 = r"(\.\|\.)([^. ])"
+        repl_spl = r"\1 \2"
+
+        pat_3 = r"([^ ]+\|[^ ]+)\| "
+        repl_del = r"\1 "
+
+        pat_4 = r"\|\|"
+        repl_dup = r"|"
+
+        def __apply_regex(txt):
+            for pat, repl in [(pat_0, repl_spc), (pat_1, repl_spl), (pat_2, repl_spl), (pat_3, repl_del), (pat_4, repl_dup)]:
+                txt = re.sub(pat, repl, txt)
+            return txt
+
+        def __cleanse_label(token):
+            text, label = token
+            return text, re.sub(r"([A-Z]+)[.,)]", r"\1", label.upper())
+
+        if self.config.schema == "source":
+            for key, example in enumerate(raw):
+                yield key, {"labeled_sentence": example}
+
+        elif self.config.schema == "nusantara_seq_label":
+            spaced = list(map(__apply_regex, raw))
+            data = list(map(lambda l: [__cleanse_label(tok.split("|", 1)) for tok in filter(None, l.split(" "))], spaced))
+
+            for key, example in enumerate(data):
+                tokens, labels = zip(*example)
+                yield key, {"id": str(key), "tokens": tokens, "labels": labels}
+
+        else:
+            raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.")
+
+
+if __name__ == "__main__":
+    datasets.load_dataset(__file__)
diff --git a/nusantara/nusa_datasets/ud_id_csui/__init__.py b/nusantara/nusa_datasets/ud_id_csui/__init__.py
new file mode 100644
index 00000000..e69de29b
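Reviewer note on the `pos_sun_mono` loader above: after the regex clean-up, each corpus line is just space-separated `word|TAG` tokens that are split once on `|`. A minimal sketch of that final step, using a fragment taken from the `__hotfix` rules (the trailing period token is added for illustration):

```python
line = "Kagiatan|NN éta|DT dihadiran|VBT kira-kira|DT .|."

pairs = [tok.split("|", 1) for tok in line.split(" ") if tok]
tokens, labels = zip(*pairs)
print(tokens)  # ('Kagiatan', 'éta', 'dihadiran', 'kira-kira', '.')
print(labels)  # ('NN', 'DT', 'VBT', 'DT', '.')
```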
diff --git a/nusantara/nusa_datasets/ud_id_csui/ud_id_csui.py b/nusantara/nusa_datasets/ud_id_csui/ud_id_csui.py
new file mode 100644
index 00000000..5a9281c0
--- /dev/null
+++ b/nusantara/nusa_datasets/ud_id_csui/ud_id_csui.py
@@ -0,0 +1,239 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+from conllu import TokenList
+
+from nusantara.utils import schemas
+from nusantara.utils.common_parser import load_ud_data, load_ud_data_as_nusantara_kb
+from nusantara.utils.configs import NusantaraConfig
+from nusantara.utils.constants import Tasks
+
+_CITATION = """\
+@article {10.3844/jcssp.2020.1585.1597,
+author = {Alfina, Ika and Budi, Indra and Suhartanto, Heru},
+title = {Tree Rotations for Dependency Trees: Converting the Head-Directionality of Noun Phrases},
+article_type = {journal},
+volume = {16},
+number = {11},
+year = {2020},
+month = {Nov},
+pages = {1585-1597},
+doi = {10.3844/jcssp.2020.1585.1597},
+url = {https://thescipub.com/abstract/jcssp.2020.1585.1597},
+journal = {Journal of Computer Science},
+publisher = {Science Publications}
+}
+"""
+
+_DATASETNAME = "ud_id_csui"
+
+_DESCRIPTION = """\
+UD Indonesian-CSUI is a conversion from an Indonesian constituency treebank in the Penn Treebank format named Kethu, which was itself a conversion from the constituency treebank built by Dinakaramani et al. (2015).
+This treebank is named after the place where the treebank was built: Faculty of Computer Science (CS), Universitas Indonesia (UI).
+
+About this treebank:
+- Genre is news in formal Indonesian (the majority is economic news)
+- 1030 sentences (28K words) divided into testing and training sets of around 10K words and around 18K words respectively.
+- Average of 27.4 words per sentence.
+"""
+
+_HOMEPAGE = "https://github.com/UniversalDependencies/UD_Indonesian-CSUI"
+
+_LICENSE = "CC BY-SA 4.0"
+
+_URLS = {
+    _DATASETNAME: {
+        "train": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-CSUI/master/id_csui-ud-train.conllu",
+        "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-CSUI/master/id_csui-ud-test.conllu",
+    },
+}
+
+_SUPPORTED_TASKS = [Tasks.DEPENDENCY_PARSING, Tasks.MACHINE_TRANSLATION, Tasks.POS_TAGGING]
+
+_SOURCE_VERSION = "1.0.0"
+
+_NUSANTARA_VERSION = "1.0.0"
+
+
+class UdIdCsuiDataset(datasets.GeneratorBasedBuilder):
+    """Treebank of formal Indonesian news which consists of 1030 sentences (28K words)"""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION)
+
+    # source: https://universaldependencies.org/u/pos/
+    UPOS_TAGS = ["ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"]
+
+    BUILDER_CONFIGS = [
+        NusantaraConfig(
+            name=f"{_DATASETNAME}_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=f"{_DATASETNAME}",
+        ),
+        NusantaraConfig(
+            name=f"{_DATASETNAME}_nusantara_kb",
+            version=NUSANTARA_VERSION,
+            description=f"{_DATASETNAME} Nusantara KB schema",
+            schema="nusantara_kb",
+            subset_id=f"{_DATASETNAME}",
+        ),
+        NusantaraConfig(
+            name=f"{_DATASETNAME}_nusantara_t2t",
+            version=NUSANTARA_VERSION,
+            description=f"{_DATASETNAME} Nusantara Text to Text schema",
+            schema="nusantara_t2t",
+            subset_id=f"{_DATASETNAME}",
+        ),
+        NusantaraConfig(
+            name=f"{_DATASETNAME}_nusantara_seq_label",
+            version=NUSANTARA_VERSION,
+            description=f"{_DATASETNAME} Nusantara Seq Label schema",
+            schema="nusantara_seq_label",
+            subset_id=f"{_DATASETNAME}",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    # metadata
+                    "sent_id": datasets.Value("string"),
+                    "text": datasets.Value("string"),
+                    "text_en": datasets.Value("string"),
+                    # tokens
+                    "id": [datasets.Value("string")],
+                    "form": [datasets.Value("string")],
+                    "lemma": [datasets.Value("string")],
+                    "upos": [datasets.Value("string")],
+                    "xpos": [datasets.Value("string")],
+                    "feats": [datasets.Value("string")],
+                    "head": [datasets.Value("string")],
+                    "deprel": [datasets.Value("string")],
+                    "deps": [datasets.Value("string")],
+                    "misc": [datasets.Value("string")],
+                }
+            )
+
+        elif self.config.schema == "nusantara_kb":
+            features = schemas.kb_features
+
+        elif self.config.schema == "nusantara_t2t":
+            features = schemas.text2text_features
+
+        elif self.config.schema == "nusantara_seq_label":
+            features = schemas.seq_label_features(self.UPOS_TAGS)
+
+        else:
+            raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.")
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        urls = _URLS[_DATASETNAME]
+        data_path = dl_manager.download(urls)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": data_path["train"],
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": data_path["test"],
+                },
+            ),
+        ]
+
+    @staticmethod
+    def _assert_multispan_range_is_one(token_list: TokenList):
+        """
+        Asserting that all tokens with multiple span can only have 2 span, and \
+        no field other than form has important information
+        """
+        for token in token_list.filter(id=lambda i: not isinstance(i, int)):
+            _id = token["id"]
+            assert len(_id) == 3, f"Unexpected length of non-int CONLLU Token's id. Expected 3, found {len(_id)};"
+            assert all(isinstance(a, b) for a, b in zip(_id, [int, str, int])), f"Non-int ID should be in format of '\\d+-\\d+'. Found {_id};"
+            assert _id[2] - _id[0] == 1, f"Token has more than 2 spans. Found {_id[2] - _id[0] + 1} spans;"
+            for key in ["lemma", "upos", "xpos", "feats", "head", "deprel", "deps"]:
+                assert token[key] in {"_", None}, f"Field other than 'form' should not contain extra information. Found: '{key}' = '{token[key]}'"
+
+    def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+        # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
+
+        dataset = list(load_ud_data(filepath, filter_kwargs={"id": lambda i: isinstance(i, int)}, assert_fn=self._assert_multispan_range_is_one))
+
+        if self.config.schema == "source":
+            pass
+
+        elif self.config.schema == "nusantara_kb":
+            dataset = load_ud_data_as_nusantara_kb(filepath, dataset)
+
+        elif self.config.schema == "nusantara_t2t":
+            dataset = list(
+                map(
+                    lambda d: {
+                        "id": d["sent_id"],
+                        "text_1": d["text"],
+                        "text_2": d["text_en"],
+                        "text_1_name": "ind",
+                        "text_2_name": "eng",
+                    },
+                    dataset,
+                )
+            )
+
+        elif self.config.schema == "nusantara_seq_label":
+            dataset = list(
+                map(
+                    lambda d: {
+                        "id": d["sent_id"],
+                        "tokens": d["form"],
+                        "labels": d["upos"],
+                    },
+                    dataset,
+                )
+            )
+
+        else:
+            raise NotImplementedError(f"Schema '{self.config.schema}' is not defined.")
+
+        for key, example in enumerate(dataset):
+            yield key, example
+
+
+if __name__ == "__main__":
+    datasets.load_dataset(__file__)
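Reviewer note on the `ud_id_csui` loader above: each parsed CoNLL-U sentence already carries the `text`/`text_en` metadata and the per-token `form`/`upos` lists, so the Nusantara configs are plain re-mappings of those fields. A minimal sketch with an invented sentence:

```python
# One sentence as returned by load_ud_data (values invented for illustration).
sent = {
    "sent_id": "dev-1",
    "text": "Dia membaca buku .",
    "text_en": "He reads a book .",
    "form": ["Dia", "membaca", "buku", "."],
    "upos": ["PRON", "VERB", "NOUN", "PUNCT"],
}

# nusantara_t2t: Indonesian/English sentence pair.
t2t = {"id": sent["sent_id"], "text_1": sent["text"], "text_2": sent["text_en"],
       "text_1_name": "ind", "text_2_name": "eng"}

# nusantara_seq_label: tokens with their UPOS tags.
seq_label = {"id": sent["sent_id"], "tokens": sent["form"], "labels": sent["upos"]}
print(t2t, seq_label, sep="\n")
```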
diff --git a/nusantara/nusa_datasets/xpersona_id/xpersona_id.py b/nusantara/nusa_datasets/xpersona_id/xpersona_id.py
new file mode 100644
index 00000000..c99d8abc
--- /dev/null
+++ b/nusantara/nusa_datasets/xpersona_id/xpersona_id.py
@@ -0,0 +1,189 @@
+import os
+from pathlib import Path
+from typing import Dict, List, Tuple
+from nusantara.utils.constants import Tasks
+from nusantara.utils import schemas
+
+import datasets
+import json
+
+from nusantara.utils.configs import NusantaraConfig
+
+_CITATION = """\
+@article{lin2020xpersona,
+  title={XPersona: Evaluating multilingual personalized chatbot},
+  author={Lin, Zhaojiang and Liu, Zihan and Winata, Genta Indra and Cahyawijaya, Samuel and Madotto, Andrea and Bang, Yejin and Ishii, Etsuko and Fung, Pascale},
+  journal={arXiv preprint arXiv:2003.07568},
+  year={2020}
+}
+@inproceedings{cahyawijaya-etal-2021-indonlg,
+    title = "{I}ndo{NLG}: Benchmark and Resources for Evaluating {I}ndonesian Natural Language Generation",
+    author = "Cahyawijaya, Samuel and
+      Winata, Genta Indra and
+      Wilie, Bryan and
+      Vincentio, Karissa and
+      Li, Xiaohong and
+      Kuncoro, Adhiguna and
+      Ruder, Sebastian and
+      Lim, Zhi Yuan and
+      Bahar, Syafri and
+      Khodra, Masayu and
+      Purwarianti, Ayu and
+      Fung, Pascale",
+    booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
+    month = nov,
+    year = "2021",
+    address = "Online and Punta Cana, Dominican Republic",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2021.emnlp-main.699",
+    doi = "10.18653/v1/2021.emnlp-main.699",
+    pages = "8875--8898"
+}
+"""
+
+_DATASETNAME = "xpersona_id"
+
+_DESCRIPTION = """\
+XPersona is a multi-lingual extension of Persona-Chat.
+The XPersona dataset includes persona conversations in six languages other than English for building and evaluating multilingual personalized agents.
+"""
+
+_HOMEPAGE = ""
+
+_LICENSE = "CC-BY-SA 4.0"
+
+_URLS = {
+    _DATASETNAME: "https://storage.googleapis.com/babert-pretraining/IndoNLG_finals/downstream_task/downstream_task_datasets.zip",
+}
+
+_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]
+
+_SOURCE_VERSION = "1.0.0"
+
+_NUSANTARA_VERSION = "1.0.0"
+
+
+class XPersonaID(datasets.GeneratorBasedBuilder):
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    NUSANTARA_VERSION = datasets.Version(_NUSANTARA_VERSION)
+
+    BUILDER_CONFIGS = [
+        NusantaraConfig(
+            name="xpersona_id_source",
+            version=SOURCE_VERSION,
+            description="XPersona ID source schema",
+            schema="source",
+            subset_id="xpersona_id",
+        ),
+        NusantaraConfig(
+            name="xpersona_id_nusantara_t2t",
+            version=NUSANTARA_VERSION,
+            description="XPersona ID Nusantara schema",
+            schema="nusantara_t2t",
+            subset_id="xpersona_id",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = "xpersona_id_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "persona": datasets.Sequence(
+                        datasets.Value("string")
+                    ),
+                    "dialogue": datasets.Sequence(
+                        datasets.Sequence(
+                            datasets.Value("string")
+                        )
+                    )
+                }
+            )
+
+        elif self.config.schema == "nusantara_t2t":
+            features = schemas.text2text_features
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        urls = _URLS[_DATASETNAME]
+        data_dir = dl_manager.download_and_extract(urls)
+
+        data_dir = os.path.join(data_dir, "IndoNLG_downstream_tasks/xpersona")
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                # Whatever you put in gen_kwargs will be passed to _generate_examples
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "Id_persona_train_corrected.json"),
+                    "split": "train",
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "Id_persona_split_test_human_annotated.json"),
+                    "split": "test",
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "Id_persona_split_valid_human_annotated.json"),
+                    "split": "dev",
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
+        data = json.load(open(filepath, "r"))
+
+        if self.config.schema == "source":
+            key = 0
+            for each_data in data:
+                example = {
+                    "persona": each_data["persona"],
+                    "dialogue": each_data["dialogue"]
+                }
+                yield key, example
+                key+=1
+
+        elif self.config.schema == "nusantara_t2t":
+            id = 0
+            key = 0
+            for each_data in data:
+                persona = " | ".join(each_data["persona"])
+                for i in range(len(each_data["dialogue"]) - 1):
+                    example = {
+                        "text_1_name": persona,
+                        "text_2_name": "response"
+                    }
+
+                    # for first turn
+
+                    if i == 0:
+                        example["id"] = "{}_{}".format(id, i)
+                        example["text_1"] = "U: {}".format(each_data["dialogue"][i][0])
+                        example["text_2"] = each_data["dialogue"][i][1]
+                        yield key, example
+                        key+=1
+
+                    # for second turn and other until last turn
+
+                    example["id"] = "{}_{}".format(id, i+1)
+                    example["text_1"] = "U: {} | S: {} | U: {}".format(each_data["dialogue"][i][0], each_data["dialogue"][i][1], each_data["dialogue"][i+1][0])
+                    example["text_2"] = each_data["dialogue"][i+1][1]
+                    yield key, example
+                    key+=1
+                id+=1
+
+
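Reviewer note on the `xpersona_id` t2t generator above: a dialogue of N user/system turns yields N (context, response) pairs, with the running context rendered as `U: ... | S: ... | U: ...`. A minimal sketch of the same loop over an invented two-turn dialogue:

```python
# Each inner list is one [user_utterance, system_reply] turn (invented example).
dialogue = [["halo, apa kabar?", "baik, kamu sendiri?"],
            ["aku juga baik", "senang mendengarnya"]]

pairs = [("U: {}".format(dialogue[0][0]), dialogue[0][1])]
for i in range(len(dialogue) - 1):
    context = "U: {} | S: {} | U: {}".format(dialogue[i][0], dialogue[i][1], dialogue[i + 1][0])
    pairs.append((context, dialogue[i + 1][1]))

for text_1, text_2 in pairs:
    print(text_1, "->", text_2)
```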
diff --git a/nusantara/utils/common_parser.py b/nusantara/utils/common_parser.py
index 0581bf2b..85e92e54 100644
--- a/nusantara/utils/common_parser.py
+++ b/nusantara/utils/common_parser.py
@@ -1,3 +1,5 @@
+from typing import Iterable
+
 import pandas as pd
 from conllu import parse
 
@@ -21,29 +23,38 @@ def load_conll_data(file_path):
     return dataset
 
 
-def load_ud_data(filepath):
+def load_ud_data(filepath, filter_kwargs=None, assert_fn=None):
     """
     Load and parse conllu data.
 
     Proposed by @fhudi for issue #34 and #9.
 
     :param filepath: file path
+    :param filter_kwargs: filtering tokens, see conllu.models.TokenList.filter()
+    :param assert_fn: assertion to make sure raw data is in the expected format
     :return: generator with schema following CONLLU
     """
     dataset_raw = parse(open(filepath).read())
-    return map(lambda sent: {**sent.metadata, **pd.DataFrame(sent).to_dict(orient="list")}, dataset_raw)
+
+    filter_kwargs = filter_kwargs or dict()
+    if callable(assert_fn):
+        for token_list in dataset_raw:
+            assert_fn(token_list)
+
+    return map(lambda sent: {**sent.metadata, **pd.DataFrame(sent.filter(**filter_kwargs)).to_dict(orient="list")}, dataset_raw)
 
 
-def load_ud_data_as_nusantara_kb(filepath):
+def load_ud_data_as_nusantara_kb(filepath, dataset_source: Iterable = tuple()):
     """
     Load and parse conllu data, followed by mapping its elements to Nusantara Knowledge Base schema.
 
     Proposed by @fhudi for issue #34 and #9.
 
     :param filepath: file path
+    :param dataset_source: dataset with source schema (output of load_ud_data())
     :return: generator for Nusantara KB schema
     """
-    dataset_source = list(load_ud_data(filepath))
+    dataset_source = dataset_source or list(load_ud_data(filepath))
 
     def as_nusa_kb(tokens):
         sent_id = tokens["sent_id"]
diff --git a/nusantara/utils/constants.py b/nusantara/utils/constants.py
index d29f26b1..b1f532be 100644
--- a/nusantara/utils/constants.py
+++ b/nusantara/utils/constants.py
@@ -48,6 +48,11 @@ class Tasks(Enum):
     # Speech Recognition
     SPEECH_RECOGNITION = "ASR"
 
+    # ImageText
+    IMAGE_CAPTIONING = "IC"
+    STYLIZED_IMAGE_CAPTIONING = "SIC"
+    VISUALLY_GROUNDED_REASONING = "VGR"
+
 
 # TASK_TO_SCHEMA = {
 #     Tasks.NAMED_ENTITY_RECOGNITION: "KB",
diff --git a/nusantara/utils/schemas/__init__.py b/nusantara/utils/schemas/__init__.py
index 1671351e..67af2e0e 100644
--- a/nusantara/utils/schemas/__init__.py
+++ b/nusantara/utils/schemas/__init__.py
@@ -8,5 +8,6 @@
 from .seq_label import features as seq_label_features
 from .self_supervised_pretraining import features as ssp_features
 from .speech_recognition import features as asr_features
+from .image_text import features as image_text_features
 
-__all__ = ["kb_features", "qa_features", "text2text_features", "text_features", "text_multi_features", "pairs_features", "pairs_features_score", "seq_label_features", "ssp_features", "asr_features"]
+__all__ = ["kb_features", "qa_features", "text2text_features", "text_features", "text_multi_features", "pairs_features", "pairs_features_score", "seq_label_features", "ssp_features", "asr_features", "image_text_features"]
diff --git a/nusantara/utils/schemas/image_text.py b/nusantara/utils/schemas/image_text.py
new file mode 100644
index 00000000..59ef6d87
--- /dev/null
+++ b/nusantara/utils/schemas/image_text.py
@@ -0,0 +1,18 @@
+"""
+General ImageText Classification Schema
+"""
+import datasets
+
+
+def features(label_names = ["Yes", "No"]):
+    return datasets.Features(
+        {
+            "id": datasets.Value("string"),
+            "image_paths": datasets.Sequence(datasets.Value("string")),
+            "texts": datasets.Value("string"),
+            "metadata": {
+                "context": datasets.Value("string"),
+                "labels": datasets.Sequence(datasets.ClassLabel(names=label_names)),
+            }
+        }
+    )
+
diff --git a/tests/test_nusantara.py b/tests/test_nusantara.py
index a36bcad3..c484b5c6 100644
--- a/tests/test_nusantara.py
+++ b/tests/test_nusantara.py
@@ -12,7 +12,7 @@ import datasets
 from datasets import DatasetDict, Features
 
 from nusantara.utils.constants import Tasks
-from nusantara.utils.schemas import kb_features, pairs_features, pairs_features_score, qa_features, text2text_features, text_features, text_multi_features, seq_label_features, ssp_features, asr_features
+from nusantara.utils.schemas import kb_features, pairs_features, pairs_features_score, qa_features, text2text_features, text_features, text_multi_features, seq_label_features, ssp_features, asr_features, image_text_features
 
 sys.path.append(str(Path(__file__).parent.parent))
 
@@ -43,6 +43,9 @@
     Tasks.EMOTION_CLASSIFICATION: "TEXT",
     Tasks.SELF_SUPERVISED_PRETRAINING: "SSP",
     Tasks.SPEECH_RECOGNITION: "ASR",
+    Tasks.IMAGE_CAPTIONING: "IC",
+    Tasks.STYLIZED_IMAGE_CAPTIONING: "SIC",
+    Tasks.VISUALLY_GROUNDED_REASONING: "VGR",
 }
 
 _VALID_TASKS = set(_TASK_TO_SCHEMA.keys())
@@ -59,6 +62,9 @@
     "SEQ_LABEL": seq_label_features(),
     "SSP": ssp_features,
     "ASR": asr_features,
+    "IC": image_text_features(),
+    "SIC": image_text_features(),
+    "VGR": image_text_features(),
 }
 
 _TASK_TO_FEATURES = {
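Reviewer note on the new `image_text` schema exercised by the test changes above: a record is an id, one or more image paths, a text, and labelled metadata. A minimal sketch of a conforming example (file name and caption are placeholders):

```python
from nusantara.utils.schemas import image_text_features

features = image_text_features(label_names=["Yes", "No"])
example = {
    "id": "0",
    "image_paths": ["images/0001.jpg"],                # placeholder path
    "texts": "Seekor kucing tidur di atas sofa.",      # placeholder caption
    "metadata": {"context": "", "labels": ["Yes"]},
}
# features.encode_example(example) would validate the record against the schema.
```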
diff --git a/update_readme.py b/update_readme.py
new file mode 100644
index 00000000..05f2d265
--- /dev/null
+++ b/update_readme.py
@@ -0,0 +1,57 @@
+import os
+import pathlib
+import re
+
+root = pathlib.Path(__file__).parent.resolve()
+
+
+def replace_writing(content, marker, chunk, inline=False):
+    r = re.compile(
+        r"<!\-\- {} starts \-\->.*<!\-\- {} ends \-\->".format(marker, marker),
+        re.DOTALL,
+    )
+    if not inline:
+        chunk = "\n{}\n".format(chunk)
+    chunk = "<!-- {} starts -->{}<!-- {} ends -->".format(marker, chunk, marker)
+    return r.sub(chunk, content)
+
+
+def build_progress_bar(milestones={}):
+    progress_bar = []
+    progress_bar_template = "![Milestone {}](https://progress-bar.dev/{}/?title=Milestone%20{}%20({}%20Datasets%20Completed))"
+    for index, target in enumerate(milestones):
+        percentage = min(milestones[target] * 100 // target, 100)
+        bar = progress_bar_template.format(index + 1, percentage, index + 1, target)
+        progress_bar.append(bar)
+    return progress_bar
+
+
+def calculate_completed_dataset(base_dir="nusantara/nusa_datasets"):
+    count = 0
+    for path in os.listdir(base_dir):
+        if os.path.isdir(os.path.join(base_dir, path)) and path != "__pycache__":
+            count += 1
+    return count
+
+
+if __name__ == "__main__":
+    # read current readme
+    readme_path = root / "README.md"
+    readme = readme_path.open().read()
+
+    readme_id_path = root / "README.id.md"
+    readme_id = readme_id_path.open().read()
+
+    # calculate progress
+    target = [30, 60, 100, 150]
+    count_completed_dataset = calculate_completed_dataset()
+    milestones = {k: count_completed_dataset for k in target}
+    progress_bar = build_progress_bar(milestones=milestones)
+    entries_md = "\n\n".join(progress_bar)
+
+    # Update entries
+    rewritten_entries = replace_writing(readme, "milestone", entries_md)
+    readme_path.open("w").write(rewritten_entries)
+
+    rewritten_id_entries = replace_writing(readme_id, "milestone", entries_md)
+    readme_id_path.open("w").write(rewritten_id_entries)
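Reviewer note on `update_readme.py` above: the badge percentages are integer floor divisions capped at 100, which matches the badge values changed in the README diffs at the top of this patch (assuming 57 dataset folders are currently completed):

```python
targets = [30, 60, 100, 150]
completed = 57  # assumed current count of completed dataset folders

for index, target in enumerate(targets):
    percentage = min(completed * 100 // target, 100)
    print(f"Milestone {index + 1}: {percentage}%")
# Milestone 1: 100%, Milestone 2: 95%, Milestone 3: 57%, Milestone 4: 38%
```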