diff --git a/data_analysis/kaggle/README.md b/data_analysis/kaggle/README.md new file mode 100644 index 0000000..6ccde6b --- /dev/null +++ b/data_analysis/kaggle/README.md @@ -0,0 +1,3 @@ +# Kaggle data + +Code for curation of kaggle notebooks \ No newline at end of file diff --git a/data_analysis/kaggle/curation/README.md b/data_analysis/kaggle/curation/README.md new file mode 100644 index 0000000..8d50465 --- /dev/null +++ b/data_analysis/kaggle/curation/README.md @@ -0,0 +1,9 @@ +# data-curation-kaggle + +Code from: https://github.com/bigcode-project/data-curation-kaggle/tree/main +You can apply the following filtering: + +- length-based filtering +- rule-based filtering + +All the notebooks will be converted into a python script. \ No newline at end of file diff --git a/data_analysis/kaggle/curation/manual_sharding.py b/data_analysis/kaggle/curation/manual_sharding.py new file mode 100644 index 0000000..904b30b --- /dev/null +++ b/data_analysis/kaggle/curation/manual_sharding.py @@ -0,0 +1,66 @@ +import os +import time +from multiprocessing import Pool +from tqdm import tqdm + +from huggingface_hub import Repository + + +def save_shard(shard_tuple): + """Save shard""" + filename, shard = shard_tuple + # use to_json instead to save as json file + shard.to_parquet(filename) + + +def save_manual_shards(ds, user="loubnabnl", remote_dataset_repo="bigcode-pii-pjj"): + """Save sharded data + Args: + ds (Dataset): dataset to be saved + user (str): user name + remote_dataset_repo (str): remote dataset repository + out_path (str): path to save the shards""" + # this will create a folder OUT_PATH that is a clone of REMOTE_DATASET_REPO + # you can save the shards inside it and do git add/commit/push to push data to the hub + out_path = remote_dataset_repo + # if out path doesnt already exist + if not os.path.exists(out_path): + repo = Repository( + local_dir=out_path, + clone_from=user + "/" + remote_dataset_repo, + repo_type="dataset", + use_auth_token=True, + git_user=user, + ) + + # files will be numerous we save them in a folder called data inside out_path + os.mkdir(out_path + "/data") + SHARD_SIZE = 1000 << 20 + if ds._indices is not None: + dataset_nbytes = ds.data.nbytes * len(ds._indices) / len(ds.data) + else: + dataset_nbytes = ds.data.nbytes + num_shards = int(dataset_nbytes / SHARD_SIZE) + 1 + print(f"Number of shards: {num_shards}") + + print("sharding the dataset") + t_start = time.time() + shards = ( + ds.shard(num_shards=num_shards, index=i, contiguous=True) + for i in range(num_shards) + ) + # use f"{OUT_PATH}/data/train-{index:05d}-of-{num_shards:05d}.json" instead for json files + filenames = ( + f"{out_path}/data/train-{index:05d}-of-{num_shards:05d}.parquet" + for index in range(num_shards) + ) + + with Pool(16) as p: + list( + tqdm( + p.imap_unordered(save_shard, zip(filenames, shards), chunksize=4), + total=num_shards, + ) + ) + print(f"Time to save dataset: {time.time()-t_start:.2f}") + # to push dataset to hub do: git add/commit/push inside OUT_PATH diff --git a/data_analysis/kaggle/curation/process_kaggle.py b/data_analysis/kaggle/curation/process_kaggle.py new file mode 100644 index 0000000..6f5de6e --- /dev/null +++ b/data_analysis/kaggle/curation/process_kaggle.py @@ -0,0 +1,59 @@ +from datasets import load_dataset +from utils import parse_jupyter_into_script +import black +from manual_sharding import save_manual_shards + +TEMPLATE = '# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\nimport numpy as np # linear algebra\nimport 
pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only "../input/" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\nimport os\n\nfor dirname, _, filenames in os.walk("/kaggle/input"):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"\n# You can also write temporary files to /kaggle/temp/, but they won\'t be saved outside of the current session' +SHORT_TEMPLATE = '# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n' + +def check_syntax(code): + try: + compile(code, "", "exec") + return True + except Exception as e: + return False + +def format_code(example): + try: + # sometimes autopep8 will be stuck, so we need to set a timeout + formatted_code = black.format_str(example["script"] , mode=black.FileMode()) + if formatted_code.startswith(TEMPLATE): + formatted_code = formatted_code[len(TEMPLATE):].strip() + if formatted_code.startswith(SHORT_TEMPLATE): + formatted_code = formatted_code[len(SHORT_TEMPLATE):].strip() + example["script"] = formatted_code + except Exception as e: + print(e) + pass + return example + +def parse_whole_content_kaggle(example): + notebook = example["content"] + script_content = parse_jupyter_into_script(notebook, False) + example["script"] = script_content + return example + +def process_kaggle_jupyter(dataset, output_path, use_code_execution, workers=1): + init_size = len(dataset) + dataset = dataset.filter(lambda x: len(x["content"]) <= 500_0000, num_proc=workers) + dataset = dataset.map(parse_whole_content_kaggle, num_proc=90) + dataset = dataset.filter(lambda x: len(x["script"]) > 100, num_proc=workers) + print(f"Finish parsing the whole content, total {len(dataset)} notebooks, dropped {100 - len(dataset)/init_size * 100:.2f}% of the original dataset") + init_size = len(dataset) + # filter the syntax error + dataset = dataset.filter(lambda x: check_syntax(x["script"]), num_proc=workers) + dataset = dataset.map(format_code, num_proc=90, load_from_cache_file=False) + print(f"Check the syntax, total {len(dataset)} notebooks, dropped {100 - len(dataset)/init_size * 100:.2f}% more of the original dataset") + save_manual_shards( + dataset, user="loubnabnl", remote_dataset_repo="kaggle-scripts-clean", + ) + print("DONE! 
Example:\n") + print(dataset[0]["script"][:100]) + + +if __name__ == '__main__': + dataset = load_dataset("bigcode/kaggle-notebooks-data", + split="train") + process_kaggle_jupyter(dataset, + use_code_execution=False, + workers=36) diff --git a/data_analysis/kaggle/curation/requirements.txt b/data_analysis/kaggle/curation/requirements.txt new file mode 100644 index 0000000..b966e9b --- /dev/null +++ b/data_analysis/kaggle/curation/requirements.txt @@ -0,0 +1,7 @@ +beautifulsoup4 +tqdm +nbformat +torch +transformers +datasets +black \ No newline at end of file diff --git a/data_analysis/kaggle/curation/utils.py b/data_analysis/kaggle/curation/utils.py new file mode 100644 index 0000000..eab6fde --- /dev/null +++ b/data_analysis/kaggle/curation/utils.py @@ -0,0 +1,336 @@ +import json +import itertools +import re +from tqdm import tqdm +import sys +from typing import List +from bs4 import BeautifulSoup +import random +import keyword +import signal + +DEFAULT_OUTPUT = "" +DEFAULT_TABLE_TEMPLATE = "{}" +# truncate the part after 200 tokens +MAX_OUTPUT_LEN = 3 * 200 +# if longer than 200 tokens, we will drop the line since it is abnormal +MAX_CODE_LEN = 3 * 200 + +MAX_TABLE_COLUMN, MAX_TABLE_ROW = 5, 5 + +def parse_html_table(html): + def normalize_cell_text(text): + return text.strip().replace('\n', ' ').replace('|', '|') + + soup = BeautifulSoup(html, 'html.parser') + table = soup.find('table') + + if not table: + return '' + + rows = table.find_all('tr') + headers = [normalize_cell_text(cell.get_text()) for cell in rows[0].find_all('th')] + rows = [[normalize_cell_text(cell.get_text()) for cell in row.find_all('td')] for row in rows[1:]] + + if len(rows) > MAX_TABLE_ROW: + # random select 10 rows + rows = random.sample(rows, 5) + + # if the columns are too many, then we only select the first 10 columns + if len(headers) > MAX_TABLE_COLUMN: + headers = headers[:5] + rows = [[row[i] for i in range(5)] for row in rows] + + if len(headers) > 0: + markdown_table = ['| ' + ' | '.join(headers) + ' |', '| ' + ' | '.join(['---'] * len(headers)) + ' |'] + else: + markdown_table = [] + for row in rows: + if len(row): + markdown_table.append('| ' + ' | '.join(row) + ' |') + + if len(markdown_table): + return DEFAULT_TABLE_TEMPLATE.format('\n'.join(markdown_table)) + else: + return '' + + +def is_single_variable(line): + line = line.strip() + + # check if the line is a single variable + pattern = r'^\w+[\.\w+\(\)\d]*$' + match = re.match(pattern, line) + + # if match and line not in keyword.kwlist + if match and line not in keyword.kwlist: + return True + else: + return False + + +def filter_unused_output(plain_text: List): + content = "".join(plain_text) + # filter out progress bar + if ", ?B/s]" in content: + return False + if "................" 
diff --git a/data_analysis/kaggle/curation/utils.py b/data_analysis/kaggle/curation/utils.py new file mode 100644 index 0000000..eab6fde --- /dev/null +++ b/data_analysis/kaggle/curation/utils.py @@ -0,0 +1,336 @@ +import json +import itertools +import re +from tqdm import tqdm +import sys +from typing import List +from bs4 import BeautifulSoup +import random +import keyword +import signal + +DEFAULT_OUTPUT = "" +DEFAULT_TABLE_TEMPLATE = "{}" +# truncate the part after 200 tokens +MAX_OUTPUT_LEN = 3 * 200 +# if longer than 200 tokens, we will drop the line since it is abnormal +MAX_CODE_LEN = 3 * 200 + +MAX_TABLE_COLUMN, MAX_TABLE_ROW = 5, 5 + +def parse_html_table(html): + def normalize_cell_text(text): + return text.strip().replace('\n', ' ').replace('|', '&#124;') + + soup = BeautifulSoup(html, 'html.parser') + table = soup.find('table') + + if not table: + return '' + + rows = table.find_all('tr') + headers = [normalize_cell_text(cell.get_text()) for cell in rows[0].find_all('th')] + rows = [[normalize_cell_text(cell.get_text()) for cell in row.find_all('td')] for row in rows[1:]] + + if len(rows) > MAX_TABLE_ROW: + # randomly select MAX_TABLE_ROW rows + rows = random.sample(rows, MAX_TABLE_ROW) + + # if there are too many columns, we only keep the first MAX_TABLE_COLUMN columns + if len(headers) > MAX_TABLE_COLUMN: + headers = headers[:MAX_TABLE_COLUMN] + rows = [[row[i] for i in range(MAX_TABLE_COLUMN)] for row in rows] + + if len(headers) > 0: + markdown_table = ['| ' + ' | '.join(headers) + ' |', '| ' + ' | '.join(['---'] * len(headers)) + ' |'] + else: + markdown_table = [] + for row in rows: + if len(row): + markdown_table.append('| ' + ' | '.join(row) + ' |') + + if len(markdown_table): + return DEFAULT_TABLE_TEMPLATE.format('\n'.join(markdown_table)) + else: + return '' + + +def is_single_variable(line): + line = line.strip() + + # check if the line is a single variable + pattern = r'^\w+[\.\w+\(\)\d]*$' + match = re.match(pattern, line) + + # if match and line not in keyword.kwlist + if match and line not in keyword.kwlist: + return True + else: + return False + + +def filter_unused_output(plain_text: List): + content = "".join(plain_text) + # filter out progress bar + if ", ?B/s]" in content: + return False + if "................" in content: + return False + return True + + +def clean_code(input_code: str): + # TODO: we can extract the in-line comments and place them on the line above + input_code = re.sub("\n(\n+)", "\n\n", input_code) + code_lines = input_code.split("\n") + black_phrase_list = [ + "This Python 3 environment comes", + "here's several helpful packages to load", + "defined by the kaggle/python docker image", + "defined by the kaggle/python Docker image", + "Copyright (C)", + ] + for idx in range(len(code_lines)): + line = code_lines[idx].strip() + # magic command + if line.startswith("%") or line.startswith("!") or line.startswith("cd ") \ + or line.startswith("pip ") or line.startswith("apt ") or line.startswith("wget "): + code_lines[idx] = "" + elif len(line) > MAX_CODE_LEN: + # unexpectedly long, just drop it + code_lines[idx] = "" + for phrase in black_phrase_list: + if phrase in line: + code_lines[idx] = "" + # TODO: single variable, and we should be careful about the plt.show() case + # elif is_single_variable(line): + # code_lines[idx] = "print({})".format(line) + code_lines = [line for line in code_lines if line.strip() != ""] + # since we will have a unified formatter later, we don't need to worry about the spacing + # TODO: autopep8 will be stuck sometimes, so we need to set a timeout in the future + # return autopep8.fix_code("\n".join(code_lines)) + return "\n".join(code_lines) + + +def timeout_handler(signum, frame): + # Handle the action to be performed after the timeout triggers + # You can raise an exception or perform any other desired action + raise TimeoutError("Timeout occurred") + + +def set_timeout(seconds): + # Register the timeout handler function + signal.signal(signal.SIGALRM, timeout_handler) + # Set the timeout duration + signal.alarm(seconds) + + +def clean_output(outputs: List): + output_text = "" + if len(outputs) > 0: + # deal with figure + for output in outputs: + if 'data' in output.keys(): + all_data_keys = output['data'].keys() + # fetch text: if there is an html field, parse it + # if not, then just use the plain text + for key in all_data_keys: + content = "".join(output['data'][key]) + if key == "text/html" and ("dataframe" in content or "<table" in content): + # render dataframe-like HTML tables as markdown tables + output_text += parse_html_table(content) + elif key == "text/plain" and filter_unused_output(output['data'][key]): + output_text += content + # truncate overly long outputs + if len(output_text) > MAX_OUTPUT_LEN: + output_text = output_text[:MAX_OUTPUT_LEN] + "..." + # set a default value + if output_text == "": + output_text = DEFAULT_OUTPUT + return output_text + + +def clean_markdown(text): + text = re.sub(r'<.*?>', '', text) + text = re.sub(r'\r\n', '\n', text) + text = re.sub(r'\n+', '\n', text) + # TODO: we want to keep the hierarchical structure of markdown + # text = text.replace('#', '') + # IntroductionGreetings from the Kaggle bot! ...
+ black_phrase_list = [ + "IntroductionGreetings from the Kaggle bot!", + "Kaggle bot", + "Kaggle kerneler bot", + "automatically-generated", + "clicking run or pressing Shift+Enter", + "https://www.kaggle.com/learn/python", + "Thank You", + ] + for phrase in black_phrase_list: + if phrase in text: + text = "" + return text + + +def segment_blocks(content): + cells = [] + cell_types = [] + for cell in content: + if len(cell['source']) > 0: + output = DEFAULT_OUTPUT + if 'outputs' in cell.keys(): + output = clean_output(cell['outputs']) + cells.append({"input": ''.join(cell['source']), + "output": output}) + cell_types.append(cell['cell_type']) + # if the current cell is empty, then merge it with the next cell if they have the same type + for i in range(len(cells) - 1): + if cells[i]["output"] == DEFAULT_OUTPUT and cell_types[i] == cell_types[i + 1]: + separator = '\n' + cells[i + 1]["input"] = cells[i]["input"] + separator + cells[i + 1]["input"] + cells[i]["input"] = '' + cell_types[i] = '' + cells = [cell for cell in cells if cell["input"] != ''] + cell_types = [cell_type for cell_type in cell_types if cell_type != ''] + return cells, cell_types + + +def formatter(content, option): + assert option in ['code', 'markdown', 'result', 'raw'], "Unknown option: {}".format(option) + if option == 'code': + return clean_code(content) + elif option == 'markdown': + content = clean_markdown(content) + if content != "": + return "\n".join(["# " + line.strip() for line in content.split("\n")]) + else: + return "" + elif option == 'result' and content != DEFAULT_OUTPUT: + result_lines = content.split("\n") + if len(result_lines) >= 5: + result_lines = result_lines[:5] + ["..."] + wrapper = '"""Example Output:\n{}\n"""' + content = wrapper.format("\n".join(result_lines)) + return content + else: + return "" + + +def count_ratio_of_markdown_cells(types): + # statics the ratio of markdown cells + markdown_count = 0 + for cell_type in types: + if cell_type == "markdown": + markdown_count += 1 + return markdown_count / len(types) + + + + +def parse_jupyter_into_script(notebook_json_str, use_code_execution): + """ + Why we do not use jupytext is that we want to keep the output results of the notebook + """ + try: + notebook = json.loads(notebook_json_str) + script_content = "" + conversation_text = "" + # add the filtering: notebook without more than 4 cells will be ignored + if len(notebook) < 4: + return "" + + cells, types = segment_blocks(notebook) + if "code" not in types: + # no code, no need to parse + return "" + # follow paper https://arxiv.org/abs/2201.12901 + # here we remove the jupyter notebook whose markdown cells are less than 30% + # TODO: after discussion with pengcheng, we do not use this by now + # if count_ratio_of_markdown_cells(types) < 0.3: + # return "" + # flatten the list of cells to incorporate markdown and code + + for i in range(len(cells)): + # if this is the last cell and it is a markdown cell, then we do not need to parse it + if i == len(cells) - 1 and types[i] == 'markdown': + break + cell, cell_type = cells[i], types[i] + if cell['output'] == "": + cell['output'] = "" + cell_script = "" + text_code_part = formatter(cell['input'], cell_type) + + # if do not use, then set it as empty + if use_code_execution: + result_part = formatter(cell['output'], 'result') + else: + result_part = "" + + if result_part != "" and text_code_part != "": + cell_script = text_code_part + "\n" + result_part + "\n\n" + elif len(cell['output']) != 0 and cell_type == 'markdown': + # the markdown 
indicates an interactive widget but we cannot show it now, so ignore it + pass + elif result_part == "" and text_code_part != "": + cell_script = text_code_part + "\n" + # markdown should be separated with the previous code + if cell_type == 'markdown': + cell_script = "\n" + cell_script + if "def " in text_code_part: + # excpliitly add a new line between the current code to the next markdown + # if the current code has a function definition, then we also add a new line + cell_script += "\n" + + conversation_text += cell_script + script_content = conversation_text + return script_content + except Exception as e: + print("Failed to parse the notebook: {}".format(e)) + # traceback.print_exc() + return "" diff --git a/data_analysis/kaggle/metadata/kaggle_data.ipynb b/data_analysis/kaggle/metadata/kaggle_data.ipynb new file mode 100644 index 0000000..06eda1c --- /dev/null +++ b/data_analysis/kaggle/metadata/kaggle_data.ipynb @@ -0,0 +1,1108 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Getting started\n", + "- Download metadata tables from https://www.kaggle.com/datasets/kaggle/meta-kaggle at /fsx/loubna/kaggle_data/metadata_kaggle/\n", + "- Download notebooks dataset from https://www.kaggle.com/datasets/kaggle/meta-kaggle-code at /fsx/loubna/kaggle_data/kaggle-code-data/data (note: it can take many hours)\n", + "\n", + "Some mapping between the code dataset `meta-kaggle-code` and the csv tables available at `meta-kaggle` needs to be done to retrieve the metadata of each notebook, in particular we want to find the dataset name (owner/data_name) to download datasets using kaggle API so we can add information about the dataset used in each notebooks. We also want to add upvotes, title, data description and competition description/title and any other relevant information..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import json\n", + "from pathlib import Path\n", + "\n", + "\n", + "# the Meta Kaggle Code dataset with notebooks\n", + "code_base_path = Path('/fsx/loubna/kaggle_data/kaggle-code-data/data')\n", + "# match id to the filename in Meta Kaggle Code\n", + "kv_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/KernelVersions.csv'\n", + "\n", + "# to get the name of the dataset used in each notebook\n", + "kernelversions_datasetsources_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/KernelVersionDatasetSources.csv'\n", + "datasets_versions_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/DatasetVersions.csv'\n", + "# to get the org user id of the dataset\n", + "datasets_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/Datasets.csv'\n", + "# to get owner name from its id\n", + "users_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/Users.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# retrieve notebooks, tehre are also .py and .r files to be analyzed\n", + "notebooks = code_base_path.glob('*/*/*.ipynb')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kversions = pd.read_csv(kv_csv)\n", + "datasets_versions = pd.read_csv(datasets_versions_csv)\n", + "datasets = pd.read_csv(datasets_csv)\n", + "kernelversions_datasetsources = pd.read_csv(kernelversions_datasetsources_csv)\n", + "users = pd.read_csv(users_csv)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdScriptIdParentScriptVersionIdScriptLanguageIdAuthorUserIdCreationDateVersionNumberTitleEvaluationDateIsChangeTotalLinesLinesInsertedFromPreviousLinesChangedFromPreviousLinesUnchangedFromPreviousLinesInsertedFromForkLinesDeletedFromForkLinesChangedFromForkLinesUnchangedFromForkTotalVotes
03107865NaN229419904/26/2015 08:03:20NaNKeras deep net starter code04/26/2015False158.0NaNNaNNaN0.00.00.0158.00
13321991NaN12896304/27/2015 15:13:37NaNYo buddies, let's party04/27/2015False4.00.00.04.0NaNNaNNaNNaN0
2100184583NaN221644505/27/2015 19:12:59NaNscTryOut05/27/2015True81.09.05.067.0NaNNaNNaNNaN0
3101154682NaN1188905/28/2015 03:39:58NaNMax(Time Elapsed,Mean Time) Benchmark05/28/2015True22.00.02.020.00.00.02.020.00
4101944702NaN230392805/28/2015 11:30:32NaNSimple Lasagne NN05/28/2015False246.00.00.0247.00.00.02.0245.00
\n", + "
" + ], + "text/plain": [ + " Id ScriptId ParentScriptVersionId ScriptLanguageId AuthorUserId \\\n", + "0 3107 865 NaN 2 294199 \n", + "1 3321 991 NaN 1 28963 \n", + "2 10018 4583 NaN 2 216445 \n", + "3 10115 4682 NaN 1 1889 \n", + "4 10194 4702 NaN 2 303928 \n", + "\n", + " CreationDate VersionNumber Title \\\n", + "0 04/26/2015 08:03:20 NaN Keras deep net starter code \n", + "1 04/27/2015 15:13:37 NaN Yo buddies, let's party \n", + "2 05/27/2015 19:12:59 NaN scTryOut \n", + "3 05/28/2015 03:39:58 NaN Max(Time Elapsed,Mean Time) Benchmark \n", + "4 05/28/2015 11:30:32 NaN Simple Lasagne NN \n", + "\n", + " EvaluationDate IsChange TotalLines LinesInsertedFromPrevious \\\n", + "0 04/26/2015 False 158.0 NaN \n", + "1 04/27/2015 False 4.0 0.0 \n", + "2 05/27/2015 True 81.0 9.0 \n", + "3 05/28/2015 True 22.0 0.0 \n", + "4 05/28/2015 False 246.0 0.0 \n", + "\n", + " LinesChangedFromPrevious LinesUnchangedFromPrevious \\\n", + "0 NaN NaN \n", + "1 0.0 4.0 \n", + "2 5.0 67.0 \n", + "3 2.0 20.0 \n", + "4 0.0 247.0 \n", + "\n", + " LinesInsertedFromFork LinesDeletedFromFork LinesChangedFromFork \\\n", + "0 0.0 0.0 0.0 \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 0.0 0.0 2.0 \n", + "4 0.0 0.0 2.0 \n", + "\n", + " LinesUnchangedFromFork TotalVotes \n", + "0 158.0 0 \n", + "1 NaN 0 \n", + "2 NaN 0 \n", + "3 20.0 0 \n", + "4 245.0 0 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kversions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdKernelVersionIdSourceDatasetVersionId
02929388886801491
12991428885061491
22932568899951491
32939558899671491
43038178898481491
\n", + "
" + ], + "text/plain": [ + " Id KernelVersionId SourceDatasetVersionId\n", + "0 292938 888680 1491\n", + "1 299142 888506 1491\n", + "2 293256 889995 1491\n", + "3 293955 889967 1491\n", + "4 303817 889848 1491" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kernelversions_datasetsources.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/fsx/loubna/kaggle_data/kaggle-code-data/data/0069/046/69046416.ipynb\n", + "69046416\n" + ] + } + ], + "source": [ + "fp = next(notebooks) \n", + "with open(fp,'r') as f:\n", + " content = f.readlines()[0]\n", + " content = json.loads(content)\n", + " cells = content['cells']\n", + " sample = {\"content\": cells}\n", + "file_id = str(fp).split('/')[-1].split('.')[0]\n", + "print(fp)\n", + "# the file id is its name\n", + "print(file_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "\n", + "pprint(cells)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now have the content of the notebook, let's retrieve its metadata: dataset, description, competition upvotes..\n", + "\n", + "There are several tables to retrieve this data from:\n", + "- KernelVersions for some metadata that will link to other tables\n", + "- KernelVersionDatasetSources for the data source\n", + "- DatasetVersion with dataset name\n", + "- Datasets with owners of the dataset\n", + "\n", + "=> can be used to download the corresponding kaggla dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdScriptIdParentScriptVersionIdScriptLanguageIdAuthorUserIdCreationDateVersionNumberTitleEvaluationDateIsChangeTotalLinesLinesInsertedFromPreviousLinesChangedFromPreviousLinesUnchangedFromPreviousLinesInsertedFromForkLinesDeletedFromForkLinesChangedFromForkLinesUnchangedFromForkTotalVotes
52217106904641618825679NaN9757161407/26/2021 08:39:574.0King County Houses Neighborhood Classification07/26/2021True269.010.00.0259.0NaNNaNNaNNaN1
\n", + "
" + ], + "text/plain": [ + " Id ScriptId ParentScriptVersionId ScriptLanguageId \\\n", + "5221710 69046416 18825679 NaN 9 \n", + "\n", + " AuthorUserId CreationDate VersionNumber \\\n", + "5221710 7571614 07/26/2021 08:39:57 4.0 \n", + "\n", + " Title EvaluationDate \\\n", + "5221710 King County Houses Neighborhood Classification 07/26/2021 \n", + "\n", + " IsChange TotalLines LinesInsertedFromPrevious \\\n", + "5221710 True 269.0 10.0 \n", + "\n", + " LinesChangedFromPrevious LinesUnchangedFromPrevious \\\n", + "5221710 0.0 259.0 \n", + "\n", + " LinesInsertedFromFork LinesDeletedFromFork LinesChangedFromFork \\\n", + "5221710 NaN NaN NaN \n", + "\n", + " LinesUnchangedFromFork TotalVotes \n", + "5221710 NaN 1 " + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# first metadata\n", + "kversion = kversions[kversions['Id']==int(file_id)]\n", + "kversion" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdKernelVersionIdSourceDatasetVersionId
32972089177595669046416270
\n", + "
" + ], + "text/plain": [ + " Id KernelVersionId SourceDatasetVersionId\n", + "3297208 91775956 69046416 270" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find data source\n", + "data_source_kernel = kernelversions_datasetsources[kernelversions_datasetsources['KernelVersionId']==int(file_id)]\n", + "data_source_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDatasetIdDatasourceVersionIdCreatorUserIdLicenseNameCreationDateVersionNumberTitleSlugSubtitleDescriptionVersionNotesTotalCompressedBytesTotalUncompressedBytes
06661CC0: Public Domain07/18/2015 00:51:121.02013 American Community Survey2013-american-community-surveyFind insights in the 2013 American Community S...The [American Community Survey](http://www.cen...Initial ReleaseNaNNaN
18881CC0: Public Domain08/18/2015 21:53:001.0Ocean Ship Logbooks (1750-1850)climate-data-from-ocean-shipsExplore changing climatology with data from ea...In the mid-eighteenth to nineteenth centuries,...Initial releaseNaNNaN
\n", + "
" + ], + "text/plain": [ + " Id DatasetId DatasourceVersionId CreatorUserId LicenseName \\\n", + "0 6 6 6 1 CC0: Public Domain \n", + "1 8 8 8 1 CC0: Public Domain \n", + "\n", + " CreationDate VersionNumber Title \\\n", + "0 07/18/2015 00:51:12 1.0 2013 American Community Survey \n", + "1 08/18/2015 21:53:00 1.0 Ocean Ship Logbooks (1750-1850) \n", + "\n", + " Slug \\\n", + "0 2013-american-community-survey \n", + "1 climate-data-from-ocean-ships \n", + "\n", + " Subtitle \\\n", + "0 Find insights in the 2013 American Community S... \n", + "1 Explore changing climatology with data from ea... \n", + "\n", + " Description VersionNotes \\\n", + "0 The [American Community Survey](http://www.cen... Initial Release \n", + "1 In the mid-eighteenth to nineteenth centuries,... Initial release \n", + "\n", + " TotalCompressedBytes TotalUncompressedBytes \n", + "0 NaN NaN \n", + "1 NaN NaN " + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "datasets_versions.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDatasetIdDatasourceVersionIdCreatorUserIdLicenseNameCreationDateVersionNumberTitleSlugSubtitleDescriptionVersionNotesTotalCompressedBytesTotalUncompressedBytes
94270128270680332CC0: Public Domain08/25/2016 15:52:491.0House Sales in King County, USAhousesalespredictionPredict house price using regressionThis dataset contains house sale prices for Ki...Initial release2515206.02515206.0
\n", + "
" + ], + "text/plain": [ + " Id DatasetId DatasourceVersionId CreatorUserId LicenseName \\\n", + "94 270 128 270 680332 CC0: Public Domain \n", + "\n", + " CreationDate VersionNumber Title \\\n", + "94 08/25/2016 15:52:49 1.0 House Sales in King County, USA \n", + "\n", + " Slug Subtitle \\\n", + "94 housesalesprediction Predict house price using regression \n", + "\n", + " Description VersionNotes \\\n", + "94 This dataset contains house sale prices for Ki... Initial release \n", + "\n", + " TotalCompressedBytes TotalUncompressedBytes \n", + "94 2515206.0 2515206.0 " + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "source_id = data_source_kernel['SourceDatasetVersionId']\n", + "dataset_name = datasets_versions[datasets_versions['Id']==int(source_id)]\n", + "dataset_name" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'housesalesprediction'" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get str in dataset_name[\"Slug\"]\n", + "data_name = dataset_name[\"Slug\"].values[0]\n", + "data_name" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdCreatorUserIdOwnerUserIdOwnerOrganizationIdCurrentDatasetVersionIdCurrentDatasourceVersionIdForumIdTypeCreationDateLastActivityDateTotalViewsTotalDownloadsTotalVotesTotalKernels
11128680332680332.0NaN270.0270.01447208/25/2016 15:52:4902/06/201899686617251620411225
\n", + "
" + ], + "text/plain": [ + " Id CreatorUserId OwnerUserId OwnerOrganizationId \\\n", + "11 128 680332 680332.0 NaN \n", + "\n", + " CurrentDatasetVersionId CurrentDatasourceVersionId ForumId Type \\\n", + "11 270.0 270.0 1447 2 \n", + "\n", + " CreationDate LastActivityDate TotalViews TotalDownloads \\\n", + "11 08/25/2016 15:52:49 02/06/2018 996866 172516 \n", + "\n", + " TotalVotes TotalKernels \n", + "11 2041 1225 " + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check in datasets if a row has id =dataset_name[\"DatasetId\"]\n", + "owner = datasets[datasets['Id']==int(dataset_name[\"DatasetId\"])]\n", + "owner" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdUserNameDisplayNameRegisterDatePerformanceTier
01kaggleteamKaggle Team03/24/20115
1368antgoldbloomAnthony Goldbloom01/20/20102
\n", + "
" + ], + "text/plain": [ + " Id UserName DisplayName RegisterDate PerformanceTier\n", + "0 1 kaggleteam Kaggle Team 03/24/2011 5\n", + "1 368 antgoldbloom Anthony Goldbloom 01/20/2010 2" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11 680332.0\n", + "Name: OwnerUserId, dtype: float64" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "owner[\"OwnerUserId\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdUserNameDisplayNameRegisterDatePerformanceTier
597836680332harlfoxemharlfoxem08/05/20161
\n", + "
" + ], + "text/plain": [ + " Id UserName DisplayName RegisterDate PerformanceTier\n", + "597836 680332 harlfoxem harlfoxem 08/05/2016 1" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let's get username\n", + "user_id = users[users['Id']==int(owner[\"OwnerUserId\"])]\n", + "user_id" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'harlfoxem/housesalesprediction'" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_data = f'{user_id[\"UserName\"].values[0]}/{data_name}'\n", + "final_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then retrieve the data from: https://www.kaggle.com/datasets/harlfoxem/housesalesprediction 🎉" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.9 ('eval-harness': conda)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "271972ab9158cd42175bc1ec5288153b91d150291a0b625c2babd1911356e891" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data_analysis/kaggle/metadata/manual_sharding.py b/data_analysis/kaggle/metadata/manual_sharding.py new file mode 100644 index 0000000..904b30b --- /dev/null +++ b/data_analysis/kaggle/metadata/manual_sharding.py @@ -0,0 +1,66 @@ +import os +import time +from multiprocessing import Pool +from tqdm import tqdm + +from huggingface_hub import Repository + + +def save_shard(shard_tuple): + """Save shard""" + filename, shard = shard_tuple + # use to_json instead to save as json file + shard.to_parquet(filename) + + +def save_manual_shards(ds, user="loubnabnl", remote_dataset_repo="bigcode-pii-pjj"): + """Save sharded data + Args: + ds (Dataset): dataset to be saved + user (str): user name + remote_dataset_repo (str): remote dataset repository + out_path (str): path to save the shards""" + # this will create a folder OUT_PATH that is a clone of REMOTE_DATASET_REPO + # you can save the shards inside it and do git add/commit/push to push data to the hub + out_path = remote_dataset_repo + # if out path doesnt already exist + if not os.path.exists(out_path): + repo = Repository( + local_dir=out_path, + clone_from=user + "/" + remote_dataset_repo, + repo_type="dataset", + use_auth_token=True, + git_user=user, + ) + + # files will be numerous we save them in a folder called data inside out_path + os.mkdir(out_path + "/data") + SHARD_SIZE = 1000 << 20 + if ds._indices is not None: + dataset_nbytes = ds.data.nbytes * len(ds._indices) / len(ds.data) + else: + dataset_nbytes = ds.data.nbytes + num_shards = int(dataset_nbytes / SHARD_SIZE) + 1 + print(f"Number of shards: {num_shards}") + + print("sharding the dataset") + t_start = time.time() + shards = ( + ds.shard(num_shards=num_shards, index=i, contiguous=True) + for i in range(num_shards) + ) + # use f"{OUT_PATH}/data/train-{index:05d}-of-{num_shards:05d}.json" instead for json files + filenames = ( + f"{out_path}/data/train-{index:05d}-of-{num_shards:05d}.parquet" + for index in range(num_shards) + ) + + with Pool(16) as p: + list( + tqdm( + p.imap_unordered(save_shard, 
zip(filenames, shards), chunksize=4), + total=num_shards, + ) + ) + print(f"Time to save dataset: {time.time()-t_start:.2f}") + # to push dataset to hub do: git add/commit/push inside OUT_PATH diff --git a/data_analysis/kaggle/metadata/process_data.py b/data_analysis/kaggle/metadata/process_data.py new file mode 100644 index 0000000..3d73011 --- /dev/null +++ b/data_analysis/kaggle/metadata/process_data.py @@ -0,0 +1,58 @@ +# code for converting the kaggle dataset to a standard dataframe (metadata not added here, see retreive_metadata.py) +import pandas as pd +import json +from pathlib import Path +from multiprocessing import Pool, cpu_count +from datasets import Dataset + +code_base_path = Path('/fsx/loubna/kaggle_data/kaggle-code-data/data') + +# Function to extract content from a notebook +def extract_content(fp): + try: + with open(fp, 'r', encoding='utf-8') as f: + content = json.load(f) + cells = content.get('cells', []) + cells = json.dumps(cells) + file_id = fp.stem + return {'file_id': file_id, 'content': cells, 'local_path': str(fp)} + except json.JSONDecodeError: + print(f"Error decoding JSON for file: {fp}") + return {'file_id': None, 'content': None, 'local_path': str(fp)} + + +def find_notebooks(base_dir): + return list(base_dir.glob('*/*.ipynb')) + + +def main(): + sub_dirs = [x for x in code_base_path.iterdir() if x.is_dir()] + + # Use a Pool of workers to find notebooks + with Pool(cpu_count()) as p: + notebook_lists = p.map(find_notebooks, sub_dirs) + print(f"number of notebook dirs retrieved {len(notebook_lists)}") + # Flatten the list of lists + all_notebooks = [item for sublist in notebook_lists for item in sublist] + print(f"total number of notebooks {len(all_notebooks)}") + + # Use a Pool of workers to extract content + print("starting extraction...") + with Pool(cpu_count()) as p: + data = p.map(extract_content, all_notebooks) + print("extraction finished") + + # save data + df = pd.DataFrame(data) + df.to_csv('kaggle_notebooks.csv', index=False) + print("saved to csv file") + ds = Dataset.from_pandas(df) + # filter out None values + ds = ds.filter(lambda x: x['file_id'] is not None) + print(f"number of notebooks after filtering {len(ds)}") + ds.push_to_hub("kaggle-notebooks-data") + print("pushed to hub") + return ds + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/data_analysis/kaggle/metadata/retreive_metadata.py b/data_analysis/kaggle/metadata/retreive_metadata.py new file mode 100644 index 0000000..b393cfa --- /dev/null +++ b/data_analysis/kaggle/metadata/retreive_metadata.py @@ -0,0 +1,85 @@ +# code for getting metadata based on file id +import pandas as pd +import json +from datasets import load_dataset +from manual_sharding import save_manual_shards + +ds = load_dataset("/fsx/loubna/kaggle-scripts-dedup", split="train", num_proc=36) + +print(f"dataset loaded with {len(ds)} rows") + +kv_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/KernelVersions.csv' +kernelversions_datasetsources_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/KernelVersionDatasetSources.csv' +datasets_versions_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/DatasetVersions.csv' +datasets_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/Datasets.csv' +users_csv = 
'/fsx/loubna/kaggle_data/metadata_kaggle/Users.csv' + +kversions = pd.read_csv(kv_csv) +datasets_versions = pd.read_csv(datasets_versions_csv) +datasets = pd.read_csv(datasets_csv) +kernelversions_datasetsources = pd.read_csv(kernelversions_datasetsources_csv) +users = pd.read_csv(users_csv) +print("metadata loaded") + +def safe_get(dataframe, condition, column=None): + """Utility function to safely get a value from a DataFrame.""" + result = dataframe[condition] + if result.empty: + return None + if column: + return result[column].values[0] + return result + +def get_metadata(file_id): + """given the id of a notebook (=the stem of its path) we retrieve metadata from the csv tables + provided by Kaggle""" + + file_id_int = int(file_id) + kversion = safe_get(kversions, kversions['Id'] == file_id_int) + data_source_kernel = safe_get(kernelversions_datasetsources, kernelversions_datasetsources['KernelVersionId'] == file_id_int) + + source_id = None if data_source_kernel is None else data_source_kernel['SourceDatasetVersionId'].values[0] + dataset_v = safe_get(datasets_versions, datasets_versions['Id'] == source_id) + + data_name = dataset_v["Slug"].values[0] if dataset_v is not None else None + dataset_id = dataset_v["DatasetId"].values[0] if dataset_v is not None else None + + source_dataset = safe_get(datasets, datasets['Id'] == dataset_id) + owner_user_id = None if source_dataset is None else source_dataset["OwnerUserId"].values[0] + + user = safe_get(users, users['Id'] == owner_user_id) + user_name = None if user is None else user["UserName"].values[0] + + return { + 'kaggle_dataset_name': data_name, + 'kaggle_dataset_owner': user_name, + 'kversion': json.dumps(kversion.to_dict(orient='records')) if kversion is not None else None, + 'kversion_datasetsources': json.dumps(data_source_kernel.to_dict(orient='records')) if data_source_kernel is not None else None, + 'dataset_versions': json.dumps(dataset_v.to_dict(orient='records')) if dataset_v is not None else None, + 'datasets': json.dumps(source_dataset.to_dict(orient='records')) if source_dataset is not None else None, + 'users': json.dumps(user.to_dict(orient='records')) if user is not None else None + } + + +def retrieve_metadata(row): + output = get_metadata(row['file_id']) + return output + +# known issue: when using map with multiprocessing (num_proc > 1) the new columns come back as None, so we map in a single process +new_ds = ds.map(retrieve_metadata) +save_manual_shards( + new_ds, user="loubnabnl", remote_dataset_repo="kaggle-scripts-clean-dedup-meta", +) +subset = ds.select(range(10_000)) +subset.push_to_hub("kaggle_scripts_subset") +print("Done! 💃🏻💥") +#new_ds.push_to_hub("kaggle-notebooks-data-w-metadata")
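The `kaggle_dataset_owner`/`kaggle_dataset_name` pair produced by `get_metadata` is exactly the handle the Kaggle API expects, which is the step the metadata notebook above set out to enable ("download datasets using kaggle API"). A minimal sketch of that last step, assuming the official `kaggle` package is installed and `~/.kaggle/kaggle.json` credentials are configured (`download_referenced_dataset` is a hypothetical helper, not part of this repo):

```python
# Minimal sketch (assumption, not part of this repo): fetch the dataset a
# notebook was written against, using the owner/slug retrieved above.
from pathlib import Path

from kaggle.api.kaggle_api_extended import KaggleApi


def download_referenced_dataset(row, out_dir="kaggle_datasets"):
    owner = row["kaggle_dataset_owner"]
    name = row["kaggle_dataset_name"]
    if owner is None or name is None:
        return None  # no dataset source was found for this notebook
    handle = f"{owner}/{name}"  # e.g. "harlfoxem/housesalesprediction"
    target = Path(out_dir) / owner / name
    target.mkdir(parents=True, exist_ok=True)
    api = KaggleApi()
    api.authenticate()  # reads ~/.kaggle/kaggle.json
    api.dataset_download_files(handle, path=str(target), unzip=True)
    return target


# example usage on the first row of the enriched dataset:
# download_referenced_dataset(new_ds[0])
```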