bigcode-project · loubnabnl · Sep 20, 2023 · Sep 20, 2023 · Sep 21, 2023 · Sep 21, 2023
diff --git a/data_analysis/kaggle/README.md b/data_analysis/kaggle/README.md
@@ -0,0 +1,3 @@
+# Kaggle data
+
+Code for curating Kaggle notebooks.
diff --git a/data_analysis/kaggle/curation/README.md b/data_analysis/kaggle/curation/README.md
@@ -0,0 +1,9 @@
+# data-curation-kaggle
+
+Code from: https://github.com/bigcode-project/data-curation-kaggle/tree/main
+
+You can apply the following filtering:
+
+- length-based filtering
+- rule-based filtering
+
+Each notebook is converted into a Python script.
diff --git a/data_analysis/kaggle/curation/manual_sharding.py b/data_analysis/kaggle/curation/manual_sharding.py
@@ -0,0 +1,66 @@
+import os
+import time
+from multiprocessing import Pool
+from tqdm import tqdm
+
+from huggingface_hub import Repository
+
+
+def save_shard(shard_tuple):
+    """Write a single shard to disk as a parquet file.
+
+    Args:
+        shard_tuple (tuple): ``(filename, shard)`` pair; ``shard`` is written
+            to ``filename`` through its ``to_parquet`` method.
+    """
+    destination, piece = shard_tuple
+    # call piece.to_json(destination) instead to save the shard as a json file
+    piece.to_parquet(destination)
+
+
+def save_manual_shards(
+    ds,
+    user="loubnabnl",
+    remote_dataset_repo="bigcode-pii-pjj",
+    shard_size=1000 << 20,
+    num_workers=16,
+):
+    """Save the dataset as parquet shards inside a local clone of a Hub dataset repo.
+
+    The local folder is named after ``remote_dataset_repo``; when it does not
+    exist yet it is created by cloning ``{user}/{remote_dataset_repo}``. The
+    shards are written to its ``data`` sub-folder. Nothing is pushed: run
+    git add/commit/push inside the clone afterwards to upload the data.
+
+    Args:
+        ds (Dataset): dataset to be saved
+        user (str): user name owning the remote repository
+        remote_dataset_repo (str): remote dataset repository, also used as the
+            name of the local clone folder
+        shard_size (int): target size of one shard in bytes (default 1000 MiB)
+        num_workers (int): number of processes writing shards in parallel
+    """
+    out_path = remote_dataset_repo
+    # clone the remote repo only when the local folder is not there yet
+    if not os.path.exists(out_path):
+        Repository(
+            local_dir=out_path,
+            clone_from=user + "/" + remote_dataset_repo,
+            repo_type="dataset",
+            use_auth_token=True,
+            git_user=user,
+        )
+
+    # files will be numerous, so we save them in a folder called data inside out_path.
+    # exist_ok lets the function be re-run on an existing clone instead of
+    # failing with FileExistsError.
+    os.makedirs(f"{out_path}/data", exist_ok=True)
+
+    # when the dataset is a selection of rows (indices mapping), scale the size
+    # of the underlying table by the fraction of rows actually kept
+    if ds._indices is not None:
+        dataset_nbytes = ds.data.nbytes * len(ds._indices) / len(ds.data)
+    else:
+        dataset_nbytes = ds.data.nbytes
+    num_shards = int(dataset_nbytes / shard_size) + 1
+    print(f"Number of shards: {num_shards}")
+
+    print("sharding the dataset")
+    t_start = time.time()
+    # both generators are lazy: shards are only materialised when the pool
+    # pulls them
+    shards = (
+        ds.shard(num_shards=num_shards, index=i, contiguous=True)
+        for i in range(num_shards)
+    )
+    # use f"{out_path}/data/train-{index:05d}-of-{num_shards:05d}.json" instead for json files
+    filenames = (
+        f"{out_path}/data/train-{index:05d}-of-{num_shards:05d}.parquet"
+        for index in range(num_shards)
+    )
+
+    with Pool(num_workers) as p:
+        # list(...) drains the iterator so every shard is written before leaving the pool
+        list(
+            tqdm(
+                p.imap_unordered(save_shard, zip(filenames, shards), chunksize=4),
+                total=num_shards,
+            )
+        )
+    print(f"Time to save dataset: {time.time()-t_start:.2f}")
+    # to push dataset to hub do: git add/commit/push inside out_path
diff --git a/data_analysis/kaggle/curation/process_kaggle.py b/data_analysis/kaggle/curation/process_kaggle.py
@@ -0,0 +1,59 @@
+from datasets import load_dataset
+from utils import parse_jupyter_into_script
+import black
+from manual_sharding import save_manual_shards
+
+# Header text that format_code() removes from the start of a black-formatted
+# script. NOTE(review): presumably the boilerplate Kaggle inserts at the top
+# of new notebooks - confirm.
+TEMPLATE = '# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\nimport numpy as np  # linear algebra\nimport pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only "../input/" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\nimport os\n\nfor dirname, _, filenames in os.walk("/kaggle/input"):\n    for filename in filenames:\n        print(os.path.join(dirname, filename))\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"\n# You can also write temporary files to /kaggle/temp/, but they won\'t be saved outside of the current session'
+# First line of TEMPLATE only; format_code() also strips it when it is found
+# at the start of the script.
+SHORT_TEMPLATE = '# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n'
+
+def check_syntax(code):
+    """Return True when ``code`` compiles as a Python module, False otherwise.
+
+    The source is only compiled, never executed. Any exception raised by
+    ``compile`` (not just ``SyntaxError``) counts as a failed check.
+    """
+    try:
+        compile(code, "<string>", "exec")
+    except Exception:
+        return False
+    else:
+        return True
+
+def format_code(example):
+    """Format ``example["script"]`` with black and strip the leading template.
+
+    After formatting, ``TEMPLATE`` and then ``SHORT_TEMPLATE`` are removed when
+    the script starts with them, and the remainder is whitespace-stripped.
+    If anything raises, the error is printed and the example is returned with
+    its script left as it was.
+    """
+    try:
+        script = black.format_str(example["script"], mode=black.FileMode())
+        # full header first, then its one-line variant
+        for header in (TEMPLATE, SHORT_TEMPLATE):
+            if script.startswith(header):
+                script = script[len(header):].strip()
+        example["script"] = script
+    except Exception as err:
+        print(err)
+    return example
+
+def parse_whole_content_kaggle(example):
+    """Convert the notebook in ``example["content"]`` and store it as ``example["script"]``.
+
+    The conversion is delegated to ``parse_jupyter_into_script``; the updated
+    example is returned.
+    """
+    example["script"] = parse_jupyter_into_script(example["content"], False)
+    return example
+
+def _dropped_percent(initial_size, remaining_size):
+    """Return the percentage of rows dropped, or 0.0 when there were no rows to begin with."""
+    if initial_size == 0:
+        return 0.0
+    return 100 - remaining_size / initial_size * 100
+
+
+def process_kaggle_jupyter(dataset, output_path=None, use_code_execution=False, workers=1):
+    """Turn Kaggle notebooks into cleaned, formatted scripts and save them as shards.
+
+    Steps, in order:
+      1. drop notebooks whose ``content`` is longer than 5,000,000 characters;
+      2. convert each notebook into a script (``parse_whole_content_kaggle``);
+      3. drop scripts of 100 characters or fewer;
+      4. drop scripts that do not compile (``check_syntax``);
+      5. format the scripts (``format_code``);
+      6. write the result with ``save_manual_shards`` into the
+         ``kaggle-scripts-clean`` repository of user ``loubnabnl``.
+
+    Args:
+        dataset (Dataset): notebooks, with the raw notebook in the ``content`` column
+        output_path (str): currently unused; kept for backward compatibility
+        use_code_execution (bool): currently unused; kept for backward compatibility
+        workers (int): number of processes used by every filter/map step
+    """
+    init_size = len(dataset)
+    dataset = dataset.filter(lambda x: len(x["content"]) <= 5_000_000, num_proc=workers)
+    dataset = dataset.map(parse_whole_content_kaggle, num_proc=workers)
+    dataset = dataset.filter(lambda x: len(x["script"]) > 100, num_proc=workers)
+    dropped = _dropped_percent(init_size, len(dataset))
+    print(f"Finish parsing the whole content, total {len(dataset)} notebooks, dropped {dropped:.2f}% of the original dataset")
+
+    # from here on percentages are relative to what survived the parsing stage
+    init_size = len(dataset)
+    # filter the syntax error
+    dataset = dataset.filter(lambda x: check_syntax(x["script"]), num_proc=workers)
+    # formatting is always recomputed rather than read from the datasets cache
+    dataset = dataset.map(format_code, num_proc=workers, load_from_cache_file=False)
+    dropped = _dropped_percent(init_size, len(dataset))
+    print(f"Check the syntax, total {len(dataset)} notebooks, dropped {dropped:.2f}% more of the original dataset")
+    save_manual_shards(
+        dataset, user="loubnabnl", remote_dataset_repo="kaggle-scripts-clean",
+    )
+    print("DONE! Example:\n")
+    # nothing to show when every notebook was filtered out
+    if len(dataset) > 0:
+        print(dataset[0]["script"][:100])
+
+
+if __name__ == '__main__':
+    # Load the raw Kaggle notebooks and run the whole curation pipeline.
+    dataset = load_dataset("bigcode/kaggle-notebooks-data",
+                           split="train")
+    # output_path is a required argument of process_kaggle_jupyter; omitting it
+    # raises TypeError after the dataset has already been loaded. The function
+    # does not read it, so pass the folder the shards are written to.
+    process_kaggle_jupyter(dataset,
+                           output_path="kaggle-scripts-clean",
+                           use_code_execution=False,
+                           workers=36)
diff --git a/data_analysis/kaggle/curation/requirements.txt b/data_analysis/kaggle/curation/requirements.txt
@@ -0,0 +1,7 @@
+beautifulsoup4
+tqdm
+nbformat
+torch
+transformers
+datasets
+black
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Kaggle data

		Code for curation of kaggle notebooks