diff --git a/data_analysis/kaggle/README.md b/data_analysis/kaggle/README.md new file mode 100644 index 0000000..6ccde6b --- /dev/null +++ b/data_analysis/kaggle/README.md @@ -0,0 +1,3 @@ +# Kaggle data + +Code for curation of kaggle notebooks \ No newline at end of file diff --git a/data_analysis/kaggle/curation/README.md b/data_analysis/kaggle/curation/README.md new file mode 100644 index 0000000..8d50465 --- /dev/null +++ b/data_analysis/kaggle/curation/README.md @@ -0,0 +1,9 @@ +# data-curation-kaggle + +Code from: https://github.com/bigcode-project/data-curation-kaggle/tree/main +You can apply the following filtering: + +- length-based filtering +- rule-based filtering + +All the notebooks will be converted into a python script. \ No newline at end of file diff --git a/data_analysis/kaggle/curation/manual_sharding.py b/data_analysis/kaggle/curation/manual_sharding.py new file mode 100644 index 0000000..904b30b --- /dev/null +++ b/data_analysis/kaggle/curation/manual_sharding.py @@ -0,0 +1,66 @@ +import os +import time +from multiprocessing import Pool +from tqdm import tqdm + +from huggingface_hub import Repository + + +def save_shard(shard_tuple): + """Save shard""" + filename, shard = shard_tuple + # use to_json instead to save as json file + shard.to_parquet(filename) + + +def save_manual_shards(ds, user="loubnabnl", remote_dataset_repo="bigcode-pii-pjj"): + """Save sharded data + Args: + ds (Dataset): dataset to be saved + user (str): user name + remote_dataset_repo (str): remote dataset repository + out_path (str): path to save the shards""" + # this will create a folder OUT_PATH that is a clone of REMOTE_DATASET_REPO + # you can save the shards inside it and do git add/commit/push to push data to the hub + out_path = remote_dataset_repo + # if out path doesnt already exist + if not os.path.exists(out_path): + repo = Repository( + local_dir=out_path, + clone_from=user + "/" + remote_dataset_repo, + repo_type="dataset", + use_auth_token=True, + git_user=user, + ) + + # files will be numerous we save them in a folder called data inside out_path + os.mkdir(out_path + "/data") + SHARD_SIZE = 1000 << 20 + if ds._indices is not None: + dataset_nbytes = ds.data.nbytes * len(ds._indices) / len(ds.data) + else: + dataset_nbytes = ds.data.nbytes + num_shards = int(dataset_nbytes / SHARD_SIZE) + 1 + print(f"Number of shards: {num_shards}") + + print("sharding the dataset") + t_start = time.time() + shards = ( + ds.shard(num_shards=num_shards, index=i, contiguous=True) + for i in range(num_shards) + ) + # use f"{OUT_PATH}/data/train-{index:05d}-of-{num_shards:05d}.json" instead for json files + filenames = ( + f"{out_path}/data/train-{index:05d}-of-{num_shards:05d}.parquet" + for index in range(num_shards) + ) + + with Pool(16) as p: + list( + tqdm( + p.imap_unordered(save_shard, zip(filenames, shards), chunksize=4), + total=num_shards, + ) + ) + print(f"Time to save dataset: {time.time()-t_start:.2f}") + # to push dataset to hub do: git add/commit/push inside OUT_PATH diff --git a/data_analysis/kaggle/curation/process_kaggle.py b/data_analysis/kaggle/curation/process_kaggle.py new file mode 100644 index 0000000..6f5de6e --- /dev/null +++ b/data_analysis/kaggle/curation/process_kaggle.py @@ -0,0 +1,59 @@ +from datasets import load_dataset +from utils import parse_jupyter_into_script +import black +from manual_sharding import save_manual_shards + +TEMPLATE = '# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\nimport numpy as np # linear algebra\nimport 
pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only "../input/" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\nimport os\n\nfor dirname, _, filenames in os.walk("/kaggle/input"):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"\n# You can also write temporary files to /kaggle/temp/, but they won\'t be saved outside of the current session' +SHORT_TEMPLATE = '# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n' + +def check_syntax(code): + try: + compile(code, "", "exec") + return True + except Exception as e: + return False + +def format_code(example): + try: + # sometimes autopep8 will be stuck, so we need to set a timeout + formatted_code = black.format_str(example["script"] , mode=black.FileMode()) + if formatted_code.startswith(TEMPLATE): + formatted_code = formatted_code[len(TEMPLATE):].strip() + if formatted_code.startswith(SHORT_TEMPLATE): + formatted_code = formatted_code[len(SHORT_TEMPLATE):].strip() + example["script"] = formatted_code + except Exception as e: + print(e) + pass + return example + +def parse_whole_content_kaggle(example): + notebook = example["content"] + script_content = parse_jupyter_into_script(notebook, False) + example["script"] = script_content + return example + +def process_kaggle_jupyter(dataset, output_path, use_code_execution, workers=1): + init_size = len(dataset) + dataset = dataset.filter(lambda x: len(x["content"]) <= 500_0000, num_proc=workers) + dataset = dataset.map(parse_whole_content_kaggle, num_proc=90) + dataset = dataset.filter(lambda x: len(x["script"]) > 100, num_proc=workers) + print(f"Finish parsing the whole content, total {len(dataset)} notebooks, dropped {100 - len(dataset)/init_size * 100:.2f}% of the original dataset") + init_size = len(dataset) + # filter the syntax error + dataset = dataset.filter(lambda x: check_syntax(x["script"]), num_proc=workers) + dataset = dataset.map(format_code, num_proc=90, load_from_cache_file=False) + print(f"Check the syntax, total {len(dataset)} notebooks, dropped {100 - len(dataset)/init_size * 100:.2f}% more of the original dataset") + save_manual_shards( + dataset, user="loubnabnl", remote_dataset_repo="kaggle-scripts-clean", + ) + print("DONE! 
Example:\n") + print(dataset[0]["script"][:100]) + + +if __name__ == '__main__': + dataset = load_dataset("bigcode/kaggle-notebooks-data", + split="train") + process_kaggle_jupyter(dataset, + use_code_execution=False, + workers=36) diff --git a/data_analysis/kaggle/curation/requirements.txt b/data_analysis/kaggle/curation/requirements.txt new file mode 100644 index 0000000..b966e9b --- /dev/null +++ b/data_analysis/kaggle/curation/requirements.txt @@ -0,0 +1,7 @@ +beautifulsoup4 +tqdm +nbformat +torch +transformers +datasets +black \ No newline at end of file diff --git a/data_analysis/kaggle/curation/utils.py b/data_analysis/kaggle/curation/utils.py new file mode 100644 index 0000000..eab6fde --- /dev/null +++ b/data_analysis/kaggle/curation/utils.py @@ -0,0 +1,336 @@ +import json +import itertools +import re +from tqdm import tqdm +import sys +from typing import List +from bs4 import BeautifulSoup +import random +import keyword +import signal + +DEFAULT_OUTPUT = "" +DEFAULT_TABLE_TEMPLATE = "{}" +# truncate the part after 200 tokens +MAX_OUTPUT_LEN = 3 * 200 +# if longer than 200 tokens, we will drop the line since it is abnormal +MAX_CODE_LEN = 3 * 200 + +MAX_TABLE_COLUMN, MAX_TABLE_ROW = 5, 5 + +def parse_html_table(html): + def normalize_cell_text(text): + return text.strip().replace('\n', ' ').replace('|', '|') + + soup = BeautifulSoup(html, 'html.parser') + table = soup.find('table') + + if not table: + return '' + + rows = table.find_all('tr') + headers = [normalize_cell_text(cell.get_text()) for cell in rows[0].find_all('th')] + rows = [[normalize_cell_text(cell.get_text()) for cell in row.find_all('td')] for row in rows[1:]] + + if len(rows) > MAX_TABLE_ROW: + # random select 10 rows + rows = random.sample(rows, 5) + + # if the columns are too many, then we only select the first 10 columns + if len(headers) > MAX_TABLE_COLUMN: + headers = headers[:5] + rows = [[row[i] for i in range(5)] for row in rows] + + if len(headers) > 0: + markdown_table = ['| ' + ' | '.join(headers) + ' |', '| ' + ' | '.join(['---'] * len(headers)) + ' |'] + else: + markdown_table = [] + for row in rows: + if len(row): + markdown_table.append('| ' + ' | '.join(row) + ' |') + + if len(markdown_table): + return DEFAULT_TABLE_TEMPLATE.format('\n'.join(markdown_table)) + else: + return '' + + +def is_single_variable(line): + line = line.strip() + + # check if the line is a single variable + pattern = r'^\w+[\.\w+\(\)\d]*$' + match = re.match(pattern, line) + + # if match and line not in keyword.kwlist + if match and line not in keyword.kwlist: + return True + else: + return False + + +def filter_unused_output(plain_text: List): + content = "".join(plain_text) + # filter out progress bar + if ", ?B/s]" in content: + return False + if "................" 
diff --git a/data_analysis/kaggle/curation/utils.py b/data_analysis/kaggle/curation/utils.py new file mode 100644 index 0000000..eab6fde --- /dev/null +++ b/data_analysis/kaggle/curation/utils.py @@ -0,0 +1,336 @@ +import json +import itertools +import re +from tqdm import tqdm +import sys +from typing import List +from bs4 import BeautifulSoup +import random +import keyword +import signal + +DEFAULT_OUTPUT = "" +DEFAULT_TABLE_TEMPLATE = "{}" +# truncate the part after 200 tokens +MAX_OUTPUT_LEN = 3 * 200 +# if longer than 200 tokens, we will drop the line since it is abnormal +MAX_CODE_LEN = 3 * 200 + +MAX_TABLE_COLUMN, MAX_TABLE_ROW = 5, 5 + +def parse_html_table(html): + def normalize_cell_text(text): + return text.strip().replace('\n', ' ').replace('|', '&#124;') + + soup = BeautifulSoup(html, 'html.parser') + table = soup.find('table') + + if not table: + return '' + + rows = table.find_all('tr') + headers = [normalize_cell_text(cell.get_text()) for cell in rows[0].find_all('th')] + rows = [[normalize_cell_text(cell.get_text()) for cell in row.find_all('td')] for row in rows[1:]] + + if len(rows) > MAX_TABLE_ROW: + # randomly select MAX_TABLE_ROW rows + rows = random.sample(rows, MAX_TABLE_ROW) + + # if there are too many columns, we only keep the first MAX_TABLE_COLUMN columns + if len(headers) > MAX_TABLE_COLUMN: + headers = headers[:MAX_TABLE_COLUMN] + rows = [[row[i] for i in range(MAX_TABLE_COLUMN)] for row in rows] + + if len(headers) > 0: + markdown_table = ['| ' + ' | '.join(headers) + ' |', '| ' + ' | '.join(['---'] * len(headers)) + ' |'] + else: + markdown_table = [] + for row in rows: + if len(row): + markdown_table.append('| ' + ' | '.join(row) + ' |') + + if len(markdown_table): + return DEFAULT_TABLE_TEMPLATE.format('\n'.join(markdown_table)) + else: + return '' + + +def is_single_variable(line): + line = line.strip() + + # check if the line is a single variable + pattern = r'^\w+[\.\w+\(\)\d]*$' + match = re.match(pattern, line) + + # if match and line not in keyword.kwlist + if match and line not in keyword.kwlist: + return True + else: + return False + + +def filter_unused_output(plain_text: List): + content = "".join(plain_text) + # filter out progress bar + if ", ?B/s]" in content: + return False + if "................" in content: + return False + return True + + +def clean_code(input_code: str): + # TODO: we can extract the in-line comments and place them on the line above + input_code = re.sub("\n(\n+)", "\n\n", input_code) + code_lines = input_code.split("\n") + black_phrase_list = [ + "This Python 3 environment comes", + "here's several helpful packages to load", + "defined by the kaggle/python docker image", + "defined by the kaggle/python Docker image", + "Copyright (C)", + ] + for idx in range(len(code_lines)): + line = code_lines[idx].strip() + # magic command + if line.startswith("%") or line.startswith("!") or line.startswith("cd ") \ + or line.startswith("pip ") or line.startswith("apt ") or line.startswith("wget "): + code_lines[idx] = "" + elif len(line) > MAX_CODE_LEN: + # unexpectedly long, just drop it + code_lines[idx] = "" + for phrase in black_phrase_list: + if phrase in line: + code_lines[idx] = "" + # TODO: single variable, and we should be careful about the plt.show() case + # elif is_single_variable(line): + # code_lines[idx] = "print({})".format(line) + code_lines = [line for line in code_lines if line.strip() != ""] + # since we will have a unified formatter later, we don't need to worry about the spacing + # TODO: autopep8 will be stuck sometimes, so we need to set a timeout in the future + # return autopep8.fix_code("\n".join(code_lines)) + return "\n".join(code_lines) + + +def timeout_handler(signum, frame): + # Handle the action to be performed after the timeout triggers + # You can raise an exception or perform any other desired action + raise TimeoutError("Timeout occurred") + + +def set_timeout(seconds): + # Register the timeout handler function + signal.signal(signal.SIGALRM, timeout_handler) + # Set the timeout duration + signal.alarm(seconds) + + +def clean_output(outputs: List): + output_text = "" + if len(outputs) > 0: + # deal with figure + for output in outputs: + if 'data' in output.keys(): + all_data_keys = output['data'].keys() + # fetch text: if there is an html field, parse it + # if not, then just use the plain text + for key in all_data_keys: + content = "".join(output['data'][key]) + if key == "text/html" and ("dataframe" in content or "<table" in content): + # render dataframe-like HTML tables as markdown tables + output_text += parse_html_table(content) + elif key == "text/plain" and filter_unused_output(output['data'][key]): + output_text += content + # truncate overly long outputs + if len(output_text) > MAX_OUTPUT_LEN: + output_text = output_text[:MAX_OUTPUT_LEN] + "..." + # set a default value + if output_text == "": + output_text = DEFAULT_OUTPUT + return output_text + + +def clean_markdown(text): + text = re.sub(r'<.*?>', '', text) + text = re.sub(r'\r\n', '\n', text) + text = re.sub(r'\n+', '\n', text) + # TODO: we want to keep the hierarchical structure of markdown + # text = text.replace('#', '') + # IntroductionGreetings from the Kaggle bot! ...
+ black_phrase_list = [ + "IntroductionGreetings from the Kaggle bot!", + "Kaggle bot", + "Kaggle kerneler bot", + "automatically-generated", + "clicking run or pressing Shift+Enter", + "https://www.kaggle.com/learn/python", + "Thank You", + ] + for phrase in black_phrase_list: + if phrase in text: + text = "" + return text + + +def segment_blocks(content): + cells = [] + cell_types = [] + for cell in content: + if len(cell['source']) > 0: + output = DEFAULT_OUTPUT + if 'outputs' in cell.keys(): + output = clean_output(cell['outputs']) + cells.append({"input": ''.join(cell['source']), + "output": output}) + cell_types.append(cell['cell_type']) + # if the current cell is empty, then merge it with the next cell if they have the same type + for i in range(len(cells) - 1): + if cells[i]["output"] == DEFAULT_OUTPUT and cell_types[i] == cell_types[i + 1]: + separator = '\n' + cells[i + 1]["input"] = cells[i]["input"] + separator + cells[i + 1]["input"] + cells[i]["input"] = '' + cell_types[i] = '' + cells = [cell for cell in cells if cell["input"] != ''] + cell_types = [cell_type for cell_type in cell_types if cell_type != ''] + return cells, cell_types + + +def formatter(content, option): + assert option in ['code', 'markdown', 'result', 'raw'], "Unknown option: {}".format(option) + if option == 'code': + return clean_code(content) + elif option == 'markdown': + content = clean_markdown(content) + if content != "": + return "\n".join(["# " + line.strip() for line in content.split("\n")]) + else: + return "" + elif option == 'result' and content != DEFAULT_OUTPUT: + result_lines = content.split("\n") + if len(result_lines) >= 5: + result_lines = result_lines[:5] + ["..."] + wrapper = '"""Example Output:\n{}\n"""' + content = wrapper.format("\n".join(result_lines)) + return content + else: + return "" + + +def count_ratio_of_markdown_cells(types): + # statics the ratio of markdown cells + markdown_count = 0 + for cell_type in types: + if cell_type == "markdown": + markdown_count += 1 + return markdown_count / len(types) + + + + +def parse_jupyter_into_script(notebook_json_str, use_code_execution): + """ + Why we do not use jupytext is that we want to keep the output results of the notebook + """ + try: + notebook = json.loads(notebook_json_str) + script_content = "" + conversation_text = "" + # add the filtering: notebook without more than 4 cells will be ignored + if len(notebook) < 4: + return "" + + cells, types = segment_blocks(notebook) + if "code" not in types: + # no code, no need to parse + return "" + # follow paper https://arxiv.org/abs/2201.12901 + # here we remove the jupyter notebook whose markdown cells are less than 30% + # TODO: after discussion with pengcheng, we do not use this by now + # if count_ratio_of_markdown_cells(types) < 0.3: + # return "" + # flatten the list of cells to incorporate markdown and code + + for i in range(len(cells)): + # if this is the last cell and it is a markdown cell, then we do not need to parse it + if i == len(cells) - 1 and types[i] == 'markdown': + break + cell, cell_type = cells[i], types[i] + if cell['output'] == "": + cell['output'] = "" + cell_script = "" + text_code_part = formatter(cell['input'], cell_type) + + # if do not use, then set it as empty + if use_code_execution: + result_part = formatter(cell['output'], 'result') + else: + result_part = "" + + if result_part != "" and text_code_part != "": + cell_script = text_code_part + "\n" + result_part + "\n\n" + elif len(cell['output']) != 0 and cell_type == 'markdown': + # the markdown 
indicates an interactive widget but we cannot show it now, so ignore it + pass + elif result_part == "" and text_code_part != "": + cell_script = text_code_part + "\n" + # markdown should be separated with the previous code + if cell_type == 'markdown': + cell_script = "\n" + cell_script + if "def " in text_code_part: + # excpliitly add a new line between the current code to the next markdown + # if the current code has a function definition, then we also add a new line + cell_script += "\n" + + conversation_text += cell_script + script_content = conversation_text + return script_content + except Exception as e: + print("Failed to parse the notebook: {}".format(e)) + # traceback.print_exc() + return "" diff --git a/data_analysis/kaggle/metadata/kaggle_data.ipynb b/data_analysis/kaggle/metadata/kaggle_data.ipynb new file mode 100644 index 0000000..06eda1c --- /dev/null +++ b/data_analysis/kaggle/metadata/kaggle_data.ipynb @@ -0,0 +1,1108 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Getting started\n", + "- Download metadata tables from https://www.kaggle.com/datasets/kaggle/meta-kaggle at /fsx/loubna/kaggle_data/metadata_kaggle/\n", + "- Download notebooks dataset from https://www.kaggle.com/datasets/kaggle/meta-kaggle-code at /fsx/loubna/kaggle_data/kaggle-code-data/data (note: it can take many hours)\n", + "\n", + "Some mapping between the code dataset `meta-kaggle-code` and the csv tables available at `meta-kaggle` needs to be done to retrieve the metadata of each notebook, in particular we want to find the dataset name (owner/data_name) to download datasets using kaggle API so we can add information about the dataset used in each notebooks. We also want to add upvotes, title, data description and competition description/title and any other relevant information..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import json\n", + "from pathlib import Path\n", + "\n", + "\n", + "# the Meta Kaggle Code dataset with notebooks\n", + "code_base_path = Path('/fsx/loubna/kaggle_data/kaggle-code-data/data')\n", + "# match id to the filename in Meta Kaggle Code\n", + "kv_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/KernelVersions.csv'\n", + "\n", + "# to get the name of the dataset used in each notebook\n", + "kernelversions_datasetsources_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/KernelVersionDatasetSources.csv'\n", + "datasets_versions_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/DatasetVersions.csv'\n", + "# to get the org user id of the dataset\n", + "datasets_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/Datasets.csv'\n", + "# to get owner name from its id\n", + "users_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/Users.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# retrieve notebooks, tehre are also .py and .r files to be analyzed\n", + "notebooks = code_base_path.glob('*/*/*.ipynb')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kversions = pd.read_csv(kv_csv)\n", + "datasets_versions = pd.read_csv(datasets_versions_csv)\n", + "datasets = pd.read_csv(datasets_csv)\n", + "kernelversions_datasetsources = pd.read_csv(kernelversions_datasetsources_csv)\n", + "users = pd.read_csv(users_csv)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdScriptIdParentScriptVersionIdScriptLanguageIdAuthorUserIdCreationDateVersionNumberTitleEvaluationDateIsChangeTotalLinesLinesInsertedFromPreviousLinesChangedFromPreviousLinesUnchangedFromPreviousLinesInsertedFromForkLinesDeletedFromForkLinesChangedFromForkLinesUnchangedFromForkTotalVotes
03107865NaN229419904/26/2015 08:03:20NaNKeras deep net starter code04/26/2015False158.0NaNNaNNaN0.00.00.0158.00
13321991NaN12896304/27/2015 15:13:37NaNYo buddies, let's party04/27/2015False4.00.00.04.0NaNNaNNaNNaN0
2100184583NaN221644505/27/2015 19:12:59NaNscTryOut05/27/2015True81.09.05.067.0NaNNaNNaNNaN0
3101154682NaN1188905/28/2015 03:39:58NaNMax(Time Elapsed,Mean Time) Benchmark05/28/2015True22.00.02.020.00.00.02.020.00
4101944702NaN230392805/28/2015 11:30:32NaNSimple Lasagne NN05/28/2015False246.00.00.0247.00.00.02.0245.00
\n", + "
" + ], + "text/plain": [ + " Id ScriptId ParentScriptVersionId ScriptLanguageId AuthorUserId \\\n", + "0 3107 865 NaN 2 294199 \n", + "1 3321 991 NaN 1 28963 \n", + "2 10018 4583 NaN 2 216445 \n", + "3 10115 4682 NaN 1 1889 \n", + "4 10194 4702 NaN 2 303928 \n", + "\n", + " CreationDate VersionNumber Title \\\n", + "0 04/26/2015 08:03:20 NaN Keras deep net starter code \n", + "1 04/27/2015 15:13:37 NaN Yo buddies, let's party \n", + "2 05/27/2015 19:12:59 NaN scTryOut \n", + "3 05/28/2015 03:39:58 NaN Max(Time Elapsed,Mean Time) Benchmark \n", + "4 05/28/2015 11:30:32 NaN Simple Lasagne NN \n", + "\n", + " EvaluationDate IsChange TotalLines LinesInsertedFromPrevious \\\n", + "0 04/26/2015 False 158.0 NaN \n", + "1 04/27/2015 False 4.0 0.0 \n", + "2 05/27/2015 True 81.0 9.0 \n", + "3 05/28/2015 True 22.0 0.0 \n", + "4 05/28/2015 False 246.0 0.0 \n", + "\n", + " LinesChangedFromPrevious LinesUnchangedFromPrevious \\\n", + "0 NaN NaN \n", + "1 0.0 4.0 \n", + "2 5.0 67.0 \n", + "3 2.0 20.0 \n", + "4 0.0 247.0 \n", + "\n", + " LinesInsertedFromFork LinesDeletedFromFork LinesChangedFromFork \\\n", + "0 0.0 0.0 0.0 \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 0.0 0.0 2.0 \n", + "4 0.0 0.0 2.0 \n", + "\n", + " LinesUnchangedFromFork TotalVotes \n", + "0 158.0 0 \n", + "1 NaN 0 \n", + "2 NaN 0 \n", + "3 20.0 0 \n", + "4 245.0 0 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kversions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdKernelVersionIdSourceDatasetVersionId
02929388886801491
12991428885061491
22932568899951491
32939558899671491
43038178898481491
\n", + "
" + ], + "text/plain": [ + " Id KernelVersionId SourceDatasetVersionId\n", + "0 292938 888680 1491\n", + "1 299142 888506 1491\n", + "2 293256 889995 1491\n", + "3 293955 889967 1491\n", + "4 303817 889848 1491" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kernelversions_datasetsources.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/fsx/loubna/kaggle_data/kaggle-code-data/data/0069/046/69046416.ipynb\n", + "69046416\n" + ] + } + ], + "source": [ + "fp = next(notebooks) \n", + "with open(fp,'r') as f:\n", + " content = f.readlines()[0]\n", + " content = json.loads(content)\n", + " cells = content['cells']\n", + " sample = {\"content\": cells}\n", + "file_id = str(fp).split('/')[-1].split('.')[0]\n", + "print(fp)\n", + "# the file id is its name\n", + "print(file_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "\n", + "pprint(cells)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now have the content of the notebook, let's retrieve its metadata: dataset, description, competition upvotes..\n", + "\n", + "There are several tables to retrieve this data from:\n", + "- KernelVersions for some metadata that will link to other tables\n", + "- KernelVersionDatasetSources for the data source\n", + "- DatasetVersion with dataset name\n", + "- Datasets with owners of the dataset\n", + "\n", + "=> can be used to download the corresponding kaggla dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdScriptIdParentScriptVersionIdScriptLanguageIdAuthorUserIdCreationDateVersionNumberTitleEvaluationDateIsChangeTotalLinesLinesInsertedFromPreviousLinesChangedFromPreviousLinesUnchangedFromPreviousLinesInsertedFromForkLinesDeletedFromForkLinesChangedFromForkLinesUnchangedFromForkTotalVotes
52217106904641618825679NaN9757161407/26/2021 08:39:574.0King County Houses Neighborhood Classification07/26/2021True269.010.00.0259.0NaNNaNNaNNaN1
\n", + "
" + ], + "text/plain": [ + " Id ScriptId ParentScriptVersionId ScriptLanguageId \\\n", + "5221710 69046416 18825679 NaN 9 \n", + "\n", + " AuthorUserId CreationDate VersionNumber \\\n", + "5221710 7571614 07/26/2021 08:39:57 4.0 \n", + "\n", + " Title EvaluationDate \\\n", + "5221710 King County Houses Neighborhood Classification 07/26/2021 \n", + "\n", + " IsChange TotalLines LinesInsertedFromPrevious \\\n", + "5221710 True 269.0 10.0 \n", + "\n", + " LinesChangedFromPrevious LinesUnchangedFromPrevious \\\n", + "5221710 0.0 259.0 \n", + "\n", + " LinesInsertedFromFork LinesDeletedFromFork LinesChangedFromFork \\\n", + "5221710 NaN NaN NaN \n", + "\n", + " LinesUnchangedFromFork TotalVotes \n", + "5221710 NaN 1 " + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# first metadata\n", + "kversion = kversions[kversions['Id']==int(file_id)]\n", + "kversion" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdKernelVersionIdSourceDatasetVersionId
32972089177595669046416270
\n", + "
" + ], + "text/plain": [ + " Id KernelVersionId SourceDatasetVersionId\n", + "3297208 91775956 69046416 270" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find data source\n", + "data_source_kernel = kernelversions_datasetsources[kernelversions_datasetsources['KernelVersionId']==int(file_id)]\n", + "data_source_kernel" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDatasetIdDatasourceVersionIdCreatorUserIdLicenseNameCreationDateVersionNumberTitleSlugSubtitleDescriptionVersionNotesTotalCompressedBytesTotalUncompressedBytes
06661CC0: Public Domain07/18/2015 00:51:121.02013 American Community Survey2013-american-community-surveyFind insights in the 2013 American Community S...The [American Community Survey](http://www.cen...Initial ReleaseNaNNaN
18881CC0: Public Domain08/18/2015 21:53:001.0Ocean Ship Logbooks (1750-1850)climate-data-from-ocean-shipsExplore changing climatology with data from ea...In the mid-eighteenth to nineteenth centuries,...Initial releaseNaNNaN
\n", + "
" + ], + "text/plain": [ + " Id DatasetId DatasourceVersionId CreatorUserId LicenseName \\\n", + "0 6 6 6 1 CC0: Public Domain \n", + "1 8 8 8 1 CC0: Public Domain \n", + "\n", + " CreationDate VersionNumber Title \\\n", + "0 07/18/2015 00:51:12 1.0 2013 American Community Survey \n", + "1 08/18/2015 21:53:00 1.0 Ocean Ship Logbooks (1750-1850) \n", + "\n", + " Slug \\\n", + "0 2013-american-community-survey \n", + "1 climate-data-from-ocean-ships \n", + "\n", + " Subtitle \\\n", + "0 Find insights in the 2013 American Community S... \n", + "1 Explore changing climatology with data from ea... \n", + "\n", + " Description VersionNotes \\\n", + "0 The [American Community Survey](http://www.cen... Initial Release \n", + "1 In the mid-eighteenth to nineteenth centuries,... Initial release \n", + "\n", + " TotalCompressedBytes TotalUncompressedBytes \n", + "0 NaN NaN \n", + "1 NaN NaN " + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "datasets_versions.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDatasetIdDatasourceVersionIdCreatorUserIdLicenseNameCreationDateVersionNumberTitleSlugSubtitleDescriptionVersionNotesTotalCompressedBytesTotalUncompressedBytes
94270128270680332CC0: Public Domain08/25/2016 15:52:491.0House Sales in King County, USAhousesalespredictionPredict house price using regressionThis dataset contains house sale prices for Ki...Initial release2515206.02515206.0
\n", + "
" + ], + "text/plain": [ + " Id DatasetId DatasourceVersionId CreatorUserId LicenseName \\\n", + "94 270 128 270 680332 CC0: Public Domain \n", + "\n", + " CreationDate VersionNumber Title \\\n", + "94 08/25/2016 15:52:49 1.0 House Sales in King County, USA \n", + "\n", + " Slug Subtitle \\\n", + "94 housesalesprediction Predict house price using regression \n", + "\n", + " Description VersionNotes \\\n", + "94 This dataset contains house sale prices for Ki... Initial release \n", + "\n", + " TotalCompressedBytes TotalUncompressedBytes \n", + "94 2515206.0 2515206.0 " + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "source_id = data_source_kernel['SourceDatasetVersionId']\n", + "dataset_name = datasets_versions[datasets_versions['Id']==int(source_id)]\n", + "dataset_name" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'housesalesprediction'" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get str in dataset_name[\"Slug\"]\n", + "data_name = dataset_name[\"Slug\"].values[0]\n", + "data_name" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdCreatorUserIdOwnerUserIdOwnerOrganizationIdCurrentDatasetVersionIdCurrentDatasourceVersionIdForumIdTypeCreationDateLastActivityDateTotalViewsTotalDownloadsTotalVotesTotalKernels
11128680332680332.0NaN270.0270.01447208/25/2016 15:52:4902/06/201899686617251620411225
\n", + "
" + ], + "text/plain": [ + " Id CreatorUserId OwnerUserId OwnerOrganizationId \\\n", + "11 128 680332 680332.0 NaN \n", + "\n", + " CurrentDatasetVersionId CurrentDatasourceVersionId ForumId Type \\\n", + "11 270.0 270.0 1447 2 \n", + "\n", + " CreationDate LastActivityDate TotalViews TotalDownloads \\\n", + "11 08/25/2016 15:52:49 02/06/2018 996866 172516 \n", + "\n", + " TotalVotes TotalKernels \n", + "11 2041 1225 " + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check in datasets if a row has id =dataset_name[\"DatasetId\"]\n", + "owner = datasets[datasets['Id']==int(dataset_name[\"DatasetId\"])]\n", + "owner" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdUserNameDisplayNameRegisterDatePerformanceTier
01kaggleteamKaggle Team03/24/20115
1368antgoldbloomAnthony Goldbloom01/20/20102
\n", + "
" + ], + "text/plain": [ + " Id UserName DisplayName RegisterDate PerformanceTier\n", + "0 1 kaggleteam Kaggle Team 03/24/2011 5\n", + "1 368 antgoldbloom Anthony Goldbloom 01/20/2010 2" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11 680332.0\n", + "Name: OwnerUserId, dtype: float64" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "owner[\"OwnerUserId\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdUserNameDisplayNameRegisterDatePerformanceTier
597836680332harlfoxemharlfoxem08/05/20161
\n", + "
" + ], + "text/plain": [ + " Id UserName DisplayName RegisterDate PerformanceTier\n", + "597836 680332 harlfoxem harlfoxem 08/05/2016 1" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# let's get username\n", + "user_id = users[users['Id']==int(owner[\"OwnerUserId\"])]\n", + "user_id" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'harlfoxem/housesalesprediction'" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_data = f'{user_id[\"UserName\"].values[0]}/{data_name}'\n", + "final_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then retrieve the data from: https://www.kaggle.com/datasets/harlfoxem/housesalesprediction 🎉" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.9 ('eval-harness': conda)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "271972ab9158cd42175bc1ec5288153b91d150291a0b625c2babd1911356e891" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data_analysis/kaggle/metadata/manual_sharding.py b/data_analysis/kaggle/metadata/manual_sharding.py new file mode 100644 index 0000000..904b30b --- /dev/null +++ b/data_analysis/kaggle/metadata/manual_sharding.py @@ -0,0 +1,66 @@ +import os +import time +from multiprocessing import Pool +from tqdm import tqdm + +from huggingface_hub import Repository + + +def save_shard(shard_tuple): + """Save shard""" + filename, shard = shard_tuple + # use to_json instead to save as json file + shard.to_parquet(filename) + + +def save_manual_shards(ds, user="loubnabnl", remote_dataset_repo="bigcode-pii-pjj"): + """Save sharded data + Args: + ds (Dataset): dataset to be saved + user (str): user name + remote_dataset_repo (str): remote dataset repository + out_path (str): path to save the shards""" + # this will create a folder OUT_PATH that is a clone of REMOTE_DATASET_REPO + # you can save the shards inside it and do git add/commit/push to push data to the hub + out_path = remote_dataset_repo + # if out path doesnt already exist + if not os.path.exists(out_path): + repo = Repository( + local_dir=out_path, + clone_from=user + "/" + remote_dataset_repo, + repo_type="dataset", + use_auth_token=True, + git_user=user, + ) + + # files will be numerous we save them in a folder called data inside out_path + os.mkdir(out_path + "/data") + SHARD_SIZE = 1000 << 20 + if ds._indices is not None: + dataset_nbytes = ds.data.nbytes * len(ds._indices) / len(ds.data) + else: + dataset_nbytes = ds.data.nbytes + num_shards = int(dataset_nbytes / SHARD_SIZE) + 1 + print(f"Number of shards: {num_shards}") + + print("sharding the dataset") + t_start = time.time() + shards = ( + ds.shard(num_shards=num_shards, index=i, contiguous=True) + for i in range(num_shards) + ) + # use f"{OUT_PATH}/data/train-{index:05d}-of-{num_shards:05d}.json" instead for json files + filenames = ( + f"{out_path}/data/train-{index:05d}-of-{num_shards:05d}.parquet" + for index in range(num_shards) + ) + + with Pool(16) as p: + list( + tqdm( + p.imap_unordered(save_shard, 
zip(filenames, shards), chunksize=4), + total=num_shards, + ) + ) + print(f"Time to save dataset: {time.time()-t_start:.2f}") + # to push dataset to hub do: git add/commit/push inside OUT_PATH diff --git a/data_analysis/kaggle/metadata/process_data.py b/data_analysis/kaggle/metadata/process_data.py new file mode 100644 index 0000000..3d73011 --- /dev/null +++ b/data_analysis/kaggle/metadata/process_data.py @@ -0,0 +1,58 @@ +# code for converting the kaggle dataset to a standard dataframe (metadata not added here, see retreive_metadata.py) +import pandas as pd +import json +from pathlib import Path +from multiprocessing import Pool, cpu_count +from datasets import Dataset + +code_base_path = Path('/fsx/loubna/kaggle_data/kaggle-code-data/data') + +# Function to extract content from a notebook +def extract_content(fp): + try: + with open(fp, 'r', encoding='utf-8') as f: + content = json.load(f) + cells = content.get('cells', []) + cells = json.dumps(cells) + file_id = fp.stem + return {'file_id': file_id, 'content': cells, 'local_path': str(fp)} + except json.JSONDecodeError: + print(f"Error decoding JSON for file: {fp}") + return {'file_id': None, 'content': None, 'local_path': str(fp)} + + +def find_notebooks(base_dir): + return list(base_dir.glob('*/*.ipynb')) + + +def main(): + sub_dirs = [x for x in code_base_path.iterdir() if x.is_dir()] + + # Use a Pool of workers to find notebooks + with Pool(cpu_count()) as p: + notebook_lists = p.map(find_notebooks, sub_dirs) + print(f"number of notebook dirs retrieved {len(notebook_lists)}") + # Flatten the list of lists + all_notebooks = [item for sublist in notebook_lists for item in sublist] + print(f"total number of notebooks {len(all_notebooks)}") + + # Use a Pool of workers to extract content + print("starting extraction...") + with Pool(cpu_count()) as p: + data = p.map(extract_content, all_notebooks) + print("extraction finished") + + # save data + df = pd.DataFrame(data) + df.to_csv('kaggle_notebooks.csv', index=False) + print("saved to csv file") + ds = Dataset.from_pandas(df) + # filter out None values + ds = ds.filter(lambda x: x['file_id'] is not None) + print(f"number of notebooks after filtering {len(ds)}") + ds.push_to_hub("kaggle-notebooks-data") + print("pushed to hub") + return ds + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/data_analysis/kaggle/metadata/retreive_metadata.py b/data_analysis/kaggle/metadata/retreive_metadata.py new file mode 100644 index 0000000..b393cfa --- /dev/null +++ b/data_analysis/kaggle/metadata/retreive_metadata.py @@ -0,0 +1,85 @@ +# code for getting metadata based on file id +import pandas as pd +import json +from datasets import load_dataset +from manual_sharding import save_manual_shards + +ds = load_dataset("/fsx/loubna/kaggle-scripts-dedup", split="train", num_proc=36) + +print(f"dataset loaded with {len(ds)} rows") + +kv_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/KernelVersions.csv' +kernelversions_datasetsources_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/KernelVersionDatasetSources.csv' +datasets_versions_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/DatasetVersions.csv' +datasets_csv = '/fsx/loubna/kaggle_data/metadata_kaggle/Datasets.csv' +users_csv = 
'/fsx/loubna/kaggle_data/metadata_kaggle/Users.csv' + +kversions = pd.read_csv(kv_csv) +datasets_versions = pd.read_csv(datasets_versions_csv) +datasets = pd.read_csv(datasets_csv) +kernelversions_datasetsources = pd.read_csv(kernelversions_datasetsources_csv) +users = pd.read_csv(users_csv) +print("metadata loaded") + +def safe_get(dataframe, condition, column=None): + """Utility function to safely get a value from a DataFrame.""" + result = dataframe[condition] + if result.empty: + return None + if column: + return result[column].values[0] + return result + +def get_metadata(file_id): + """given the id of a notebook (=the stem of its path) we retrieve metadata from the csv tables + provided by Kaggle""" + + file_id_int = int(file_id) + kversion = safe_get(kversions, kversions['Id'] == file_id_int) + data_source_kernel = safe_get(kernelversions_datasetsources, kernelversions_datasetsources['KernelVersionId'] == file_id_int) + + source_id = None if data_source_kernel is None else data_source_kernel['SourceDatasetVersionId'].values[0] + dataset_v = safe_get(datasets_versions, datasets_versions['Id'] == source_id) + + data_name = dataset_v["Slug"].values[0] if dataset_v is not None else None + dataset_id = dataset_v["DatasetId"].values[0] if dataset_v is not None else None + + source_dataset = safe_get(datasets, datasets['Id'] == dataset_id) + owner_user_id = None if source_dataset is None else source_dataset["OwnerUserId"].values[0] + + user = safe_get(users, users['Id'] == owner_user_id) + user_name = None if user is None else user["UserName"].values[0] + + return { + 'kaggle_dataset_name': data_name, + 'kaggle_dataset_owner': user_name, + 'kversion': json.dumps(kversion.to_dict(orient='records')) if kversion is not None else None, + 'kversion_datasetsources': json.dumps(data_source_kernel.to_dict(orient='records')) if data_source_kernel is not None else None, + 'dataset_versions': json.dumps(dataset_v.to_dict(orient='records')) if dataset_v is not None else None, + 'datasets': json.dumps(source_dataset.to_dict(orient='records')) if source_dataset is not None else None, + 'users': json.dumps(user.to_dict(orient='records')) if user is not None else None + } + + +def retrieve_metadata(row): + output = get_metadata(row['file_id']) + return output + +# known issue: when using map with multiprocessing (num_proc > 1) the new columns come back as None, so we map in a single process +new_ds = ds.map(retrieve_metadata) +save_manual_shards( + new_ds, user="loubnabnl", remote_dataset_repo="kaggle-scripts-clean-dedup-meta", +) +subset = ds.select(range(10_000)) +subset.push_to_hub("kaggle_scripts_subset") +print("Done! 💃🏻💥") +#new_ds.push_to_hub("kaggle-notebooks-data-w-metadata")
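The `kaggle_dataset_owner`/`kaggle_dataset_name` pair produced by `get_metadata` is exactly the handle the Kaggle API expects, which is the step the metadata notebook above set out to enable ("download datasets using kaggle API"). A minimal sketch of that last step, assuming the official `kaggle` package is installed and `~/.kaggle/kaggle.json` credentials are configured (`download_referenced_dataset` is a hypothetical helper, not part of this repo):

```python
# Minimal sketch (assumption, not part of this repo): fetch the dataset a
# notebook was written against, using the owner/slug retrieved above.
from pathlib import Path

from kaggle.api.kaggle_api_extended import KaggleApi


def download_referenced_dataset(row, out_dir="kaggle_datasets"):
    owner = row["kaggle_dataset_owner"]
    name = row["kaggle_dataset_name"]
    if owner is None or name is None:
        return None  # no dataset source was found for this notebook
    handle = f"{owner}/{name}"  # e.g. "harlfoxem/housesalesprediction"
    target = Path(out_dir) / owner / name
    target.mkdir(parents=True, exist_ok=True)
    api = KaggleApi()
    api.authenticate()  # reads ~/.kaggle/kaggle.json
    api.dataset_download_files(handle, path=str(target), unzip=True)
    return target


# example usage on the first row of the enriched dataset:
# download_referenced_dataset(new_ds[0])
```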