diff --git a/data/people_countries_delta_dask/_delta_log/.00000000000000000000.json.crc b/data/people_countries_delta_dask/_delta_log/.00000000000000000000.json.crc
new file mode 100644
index 0000000..2a72af8
Binary files /dev/null and b/data/people_countries_delta_dask/_delta_log/.00000000000000000000.json.crc differ
diff --git a/data/people_countries_delta_dask/_delta_log/00000000000000000000.json b/data/people_countries_delta_dask/_delta_log/00000000000000000000.json
new file mode 100644
index 0000000..000d2e4
--- /dev/null
+++ b/data/people_countries_delta_dask/_delta_log/00000000000000000000.json
@@ -0,0 +1,6 @@
+{"commitInfo":{"timestamp":1706278148531,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[\"country\"]"},"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"3","numOutputRows":"5","numOutputBytes":"3045"},"engineInfo":"Apache-Spark/3.4.0 Delta-Lake/2.4.0","txnId":"1cbc9537-63eb-4799-8647-2d947ae8fa41"}}
+{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
+{"metaData":{"id":"1f110132-a652-4be9-815e-348f294515cf","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"first_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"last_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"country\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"continent\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["country"],"configuration":{},"createdTime":1706278146762}}
+{"add":{"path":"country=Argentina/part-00000-8d0390a3-f797-4265-b9c2-da1c941680a3.c000.snappy.parquet","partitionValues":{"country":"Argentina"},"size":1018,"modificationTime":1706278148083,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"first_name\":\"Ernesto\",\"last_name\":\"Guevara\",\"continent\":\"NaN\"},\"maxValues\":{\"first_name\":\"Ernesto\",\"last_name\":\"Guevara\",\"continent\":\"NaN\"},\"nullCount\":{\"first_name\":0,\"last_name\":0,\"continent\":0}}"}}
+{"add":{"path":"country=China/part-00000-88fba1af-b28d-4303-9c85-9a97be631d40.c000.snappy.parquet","partitionValues":{"country":"China"},"size":1002,"modificationTime":1706278148138,"dataChange":true,"stats":"{\"numRecords\":2,\"minValues\":{\"first_name\":\"Bruce\",\"last_name\":\"Lee\",\"continent\":\"Asia\"},\"maxValues\":{\"first_name\":\"Jack\",\"last_name\":\"Ma\",\"continent\":\"Asia\"},\"nullCount\":{\"first_name\":0,\"last_name\":0,\"continent\":0}}"}}
+{"add":{"path":"country=Germany/part-00000-030076e1-5ec9-47c2-830a-1569f823b6ee.c000.snappy.parquet","partitionValues":{"country":"Germany"},"size":1025,"modificationTime":1706278148185,"dataChange":true,"stats":"{\"numRecords\":2,\"minValues\":{\"first_name\":\"Soraya\",\"last_name\":\"Jala\",\"continent\":\"NaN\"},\"maxValues\":{\"first_name\":\"Wolfgang\",\"last_name\":\"Manche\",\"continent\":\"NaN\"},\"nullCount\":{\"first_name\":0,\"last_name\":0,\"continent\":0}}"}}
diff --git a/data/people_countries_delta_dask/country=Argentina/.part-00000-8d0390a3-f797-4265-b9c2-da1c941680a3.c000.snappy.parquet.crc b/data/people_countries_delta_dask/country=Argentina/.part-00000-8d0390a3-f797-4265-b9c2-da1c941680a3.c000.snappy.parquet.crc
new file mode 100644
index 0000000..31c4629
Binary files /dev/null and b/data/people_countries_delta_dask/country=Argentina/.part-00000-8d0390a3-f797-4265-b9c2-da1c941680a3.c000.snappy.parquet.crc differ
diff --git a/data/people_countries_delta_dask/country=Argentina/part-00000-8d0390a3-f797-4265-b9c2-da1c941680a3.c000.snappy.parquet b/data/people_countries_delta_dask/country=Argentina/part-00000-8d0390a3-f797-4265-b9c2-da1c941680a3.c000.snappy.parquet
new file mode 100644
index 0000000..214a0c9
Binary files /dev/null and b/data/people_countries_delta_dask/country=Argentina/part-00000-8d0390a3-f797-4265-b9c2-da1c941680a3.c000.snappy.parquet differ
diff --git a/data/people_countries_delta_dask/country=China/.part-00000-88fba1af-b28d-4303-9c85-9a97be631d40.c000.snappy.parquet.crc b/data/people_countries_delta_dask/country=China/.part-00000-88fba1af-b28d-4303-9c85-9a97be631d40.c000.snappy.parquet.crc
new file mode 100644
index 0000000..f5463cc
Binary files /dev/null and b/data/people_countries_delta_dask/country=China/.part-00000-88fba1af-b28d-4303-9c85-9a97be631d40.c000.snappy.parquet.crc differ
diff --git a/data/people_countries_delta_dask/country=China/part-00000-88fba1af-b28d-4303-9c85-9a97be631d40.c000.snappy.parquet b/data/people_countries_delta_dask/country=China/part-00000-88fba1af-b28d-4303-9c85-9a97be631d40.c000.snappy.parquet
new file mode 100644
index 0000000..61ea5f4
Binary files /dev/null and b/data/people_countries_delta_dask/country=China/part-00000-88fba1af-b28d-4303-9c85-9a97be631d40.c000.snappy.parquet differ
diff --git a/data/people_countries_delta_dask/country=Germany/.part-00000-030076e1-5ec9-47c2-830a-1569f823b6ee.c000.snappy.parquet.crc b/data/people_countries_delta_dask/country=Germany/.part-00000-030076e1-5ec9-47c2-830a-1569f823b6ee.c000.snappy.parquet.crc
new file mode 100644
index 0000000..7c8b129
Binary files /dev/null and b/data/people_countries_delta_dask/country=Germany/.part-00000-030076e1-5ec9-47c2-830a-1569f823b6ee.c000.snappy.parquet.crc differ
diff --git a/data/people_countries_delta_dask/country=Germany/part-00000-030076e1-5ec9-47c2-830a-1569f823b6ee.c000.snappy.parquet b/data/people_countries_delta_dask/country=Germany/part-00000-030076e1-5ec9-47c2-830a-1569f823b6ee.c000.snappy.parquet
new file mode 100644
index 0000000..8ab9c0a
Binary files /dev/null and b/data/people_countries_delta_dask/country=Germany/part-00000-030076e1-5ec9-47c2-830a-1569f823b6ee.c000.snappy.parquet differ
diff --git a/notebooks/python-deltalake/dask-deltalake.ipynb b/notebooks/python-deltalake/dask-deltalake.ipynb
new file mode 100644
index 0000000..423a79d
--- /dev/null
+++ b/notebooks/python-deltalake/dask-deltalake.ipynb
@@ -0,0 +1,808 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "2324f66b-f367-4b07-9892-0a3d8b9153d2",
+ "metadata": {},
+ "source": [
+ "# Using Delta Lake with Dask\n",
+ "\n",
+ "Delta Lake is a great storage format for Dask analyses. This page will explain why and how to use Delta Lake with Dask.\n",
+ "\n",
+ "You will learn how to read Delta Lakes into Dask DataFrames, how to query Delta tables with Dask, and the unique advantages Delta Lake offers the Dask community.\n",
+ "\n",
+ "Here are some of the benefits that Delta Lake provides Dask users:\n",
+ "- better performance with file skipping\n",
+ "- enhanced file skipping via Z Ordering\n",
+ "- ACID transactions for reliable writes\n",
+ "- easy time-travel functionality"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "546a47b5-5614-4078-80be-61e2b365cedc",
+ "metadata": {},
+ "source": [
+ "> ❗️ `dask-deltatable` doesn't currently work with deltalake=0.14; use deltalake=0.13.0 or lower. See https://github.com/dask-contrib/dask-deltatable/issues/65"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "001e4111-23d7-4db2-9eda-68cb57ba46d2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import dask_deltatable as ddt\n",
+ "import dask.dataframe as dd\n",
+ "import pandas as pd\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c61cc62d-ed7f-4d1c-b613-e7555708c0ac",
+ "metadata": {},
+ "source": [
+ "## Read Delta Lake into a Dask DataFrame"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "64b6710b-aa00-4f97-a2d5-b6dc9810a9b8",
+ "metadata": {},
+ "source": [
+ "Let's start with some data stored in a Delta Lake on disk. Read it into a Dask DataFrame using `dask_deltatable.read_deltalake`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "6b4a1fca-0b21-4a34-998c-7fd07d86f1ff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# read delta table into Dask DataFrame\n",
+ "delta_path = \"../../data/people_countries_delta_dask\"\n",
+ "ddf = ddt.read_deltalake(delta_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f4b07844-cba9-4095-8429-38b62f69c9e6",
+ "metadata": {},
+ "source": [
+ "Dask is a library for efficient distributed computing and works with [lazy evaluation](https://docs.dask.org/en/stable/user-interfaces.html#laziness-and-computing). Function calls to `dask.dataframe` build a task graph in the background. To trigger computation, call `.compute()`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "cf5d9296-5914-497b-be00-fe7371ed6d57",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " first_name | \n",
+ " last_name | \n",
+ " country | \n",
+ " continent | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Ernesto | \n",
+ " Guevara | \n",
+ " Argentina | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Bruce | \n",
+ " Lee | \n",
+ " China | \n",
+ " Asia | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Jack | \n",
+ " Ma | \n",
+ " China | \n",
+ " Asia | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Wolfgang | \n",
+ " Manche | \n",
+ " Germany | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Soraya | \n",
+ " Jala | \n",
+ " Germany | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " first_name last_name country continent\n",
+ "0 Ernesto Guevara Argentina NaN\n",
+ "0 Bruce Lee China Asia\n",
+ "1 Jack Ma China Asia\n",
+ "0 Wolfgang Manche Germany NaN\n",
+ "1 Soraya Jala Germany NaN"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ddf.compute()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "868bc406-14d1-4905-8dfc-1af2279c82ed",
+ "metadata": {},
+ "source": [
+ "You can read in specific versions of Delta tables by specifying a `version` number or a `datetime` timestamp:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "44b79297-0d22-4411-a84c-9b385c204624",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# # with specific version\n",
+ "# ddf = ddt.read_deltalake(delta_path, version=3)\n",
+ "\n",
+ "# # with specific datetime\n",
+ "# ddt.read_deltalake(delta_path, datetime=\"2018-12-19T16:39:57-08:00\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b8651b12-aeaf-4664-b26f-a3daaabff0b0",
+ "metadata": {},
+ "source": [
+ "`dask-deltatable` also supports reading from remote sources like S3 with:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "cbf00cc0-298c-4655-ae9a-b29d12e83d2a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ddt.read_deltalake(\"s3://bucket_name/delta_path\", version=3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c2d50dc6-0d2f-4fe6-9a8a-2cd8e3afb5eb",
+ "metadata": {},
+ "source": [
+ "> To read data from remote sources you'll need to make sure the credentials are properly configured in environment variables or config files. Refer to your cloud provider documentation to configure these."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fc085856-4a88-4087-a1da-bff626094236",
+ "metadata": {},
+ "source": [
+ "## What can I do with a Dask Deltatable?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d550eec3-7d19-4591-b752-74d98eccb997",
+ "metadata": {},
+ "source": [
+ "Reading a Delta Lake in with `dask-deltatable` returns a regular Dask DataFrame. You can perform [all the regular Dask operations](https://docs.dask.org/en/stable/dataframe.html) on this DataFrame."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "b3b749a1-6e5e-41e9-b362-9faafcf9d616",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dask.dataframe.core.DataFrame"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "type(ddf)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d8bb7107-fdce-4d22-a759-f5802155a46d",
+ "metadata": {},
+ "source": [
+ "Let's take a look at the first few rows:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "eed2de99-738c-4e9b-b2e5-fcd5ab3b89f7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " first_name | \n",
+ " last_name | \n",
+ " country | \n",
+ " continent | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Ernesto | \n",
+ " Guevara | \n",
+ " Argentina | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Wolfgang | \n",
+ " Manche | \n",
+ " Germany | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Soraya | \n",
+ " Jala | \n",
+ " Germany | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " first_name last_name country continent\n",
+ "0 Ernesto Guevara Argentina <NA>\n",
+ "1 Wolfgang Manche Germany <NA>\n",
+ "2 Soraya Jala Germany <NA>"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ddf.head(n=3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "717d4956-e095-465b-b506-ba4e376a7503",
+ "metadata": {},
+ "source": [
+ "`DataFrame.head()` only looks at the first partition of a Dask DataFrame by default. In this case, the first partition only has 1 row.\n",
+ "\n",
+ "This is because the Delta Lake has been partitioned by country:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "7e09a4a2-b2e0-44ea-b0c3-58c7fbbadeec",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[34m_delta_log\u001b[m\u001b[m \u001b[34mcountry=Argentina\u001b[m\u001b[m \u001b[34mcountry=China\u001b[m\u001b[m \u001b[34mcountry=Germany\u001b[m\u001b[m\n"
+ ]
+ }
+ ],
+ "source": [
+ "!ls ../../data/people_countries_delta_dask"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bdec4f14-921f-420f-bc19-671c4f210f6b",
+ "metadata": {},
+ "source": [
+ "`dask-deltatable` neatly reads the partitioned Delta Lake into corresponding Dask DataFrame partitions:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "f1ba689d-e198-4710-9152-0aafe761880e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# see number of partitions\n",
+ "ddf.npartitions"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f1ffc4f2-45ea-4884-b861-d8293110e97c",
+ "metadata": {},
+ "source": [
+ "You can inspect a single partition using `dask.dataframe.DataFrame.get_partition()`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "cf5724fa-2203-4917-965d-e566cd82b16e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " first_name | \n",
+ " last_name | \n",
+ " country | \n",
+ " continent | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Wolfgang | \n",
+ " Manche | \n",
+ " Germany | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Soraya | \n",
+ " Jala | \n",
+ " Germany | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " first_name last_name country continent\n",
+ "0 Wolfgang Manche Germany <NA>\n",
+ "1 Soraya Jala Germany <NA>"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ddf.get_partition(n=1).compute()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "081a17cb-5ff9-4797-9329-38178f3342f9",
+ "metadata": {},
+ "source": [
+ "## Perform Dask Operations"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "38d76a77-e09b-41c9-b516-4ecda564738a",
+ "metadata": {},
+ "source": [
+ "Let's perform some basic computations over the Delta Lake data that's now stored in our Dask DataFrame. \n",
+ "\n",
+ "Suppose you want to group the dataset by the `country` column:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "298a51f4-06b9-46f0-a654-6adbabf7eee8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " first_name | \n",
+ " last_name | \n",
+ " continent | \n",
+ "
\n",
+ " \n",
+ " country | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Argentina | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " Germany | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " China | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " first_name last_name continent\n",
+ "country \n",
+ "Argentina 1 1 0\n",
+ "Germany 2 2 0\n",
+ "China 2 2 2"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ddf.groupby(['country']).count().compute()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "11ed91c2-0fcf-4114-b057-902bc546c0d1",
+ "metadata": {},
+ "source": [
+ "Dask executes this `groupby` operation in parallel across all available cores. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b40fc539-c9d2-4050-b699-2ed00b76f4d0",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "95219b2f-434b-491c-a02d-230f711ffcfe",
+ "metadata": {},
+ "source": [
+ "## Map Functions over Partitions"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "230855dd-e3c2-461b-a969-7a0e8abed69d",
+ "metadata": {},
+ "source": [
+ "You can also use Dask's `map_partitions` method to map a custom Python function over all the partitions. \n",
+ "\n",
+ "Let's write a function that will replace the missing `continent` values with the right continent names."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "8e3fdaf8-d969-448a-af30-91886b166bac",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# define custom python function\n",
+ "\n",
+ "# get na_string\n",
+ "df = ddf.get_partition(0).compute()\n",
+ "na_string = df.iloc[0].continent\n",
+ "na_string\n",
+ "\n",
+ "# define function\n",
+ "def replace_proper(partition, na_string):\n",
+ " if [partition.country == \"Argentina\"]:\n",
+ " partition.loc[partition.country==\"Argentina\"] = partition.loc[partition.country==\"Argentina\"].replace(na_string, \"South America\")\n",
+ " if [partition.country == \"Germany\"]:\n",
+ " partition.loc[partition.country==\"Germany\"] = partition.loc[partition.country==\"Germany\"].replace(na_string, \"Europe\")\n",
+ " else:\n",
+ " pass\n",
+ " return partition "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "a1f7d880-0152-47bb-a67c-3d880c1d3e8b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " first_name | \n",
+ " last_name | \n",
+ " country | \n",
+ " continent | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Ernesto | \n",
+ " Guevara | \n",
+ " Argentina | \n",
+ " South America | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Wolfgang | \n",
+ " Manche | \n",
+ " Germany | \n",
+ " Europe | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Soraya | \n",
+ " Jala | \n",
+ " Germany | \n",
+ " Europe | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Bruce | \n",
+ " Lee | \n",
+ " China | \n",
+ " Asia | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Jack | \n",
+ " Ma | \n",
+ " China | \n",
+ " Asia | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " first_name last_name country continent\n",
+ "0 Ernesto Guevara Argentina South America\n",
+ "0 Wolfgang Manche Germany Europe\n",
+ "1 Soraya Jala Germany Europe\n",
+ "0 Bruce Lee China Asia\n",
+ "1 Jack Ma China Asia"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# define metadata and map function over partitions\n",
+ "meta = dict(ddf.dtypes)\n",
+ "ddf3 = ddf.map_partitions(replace_proper, na_string, meta=meta)\n",
+ "ddf3.compute()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4a639399-56d5-40cf-9e14-762f52fd78c8",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3a475f84-cb5f-44ba-bd42-511ce879318c",
+ "metadata": {},
+ "source": [
+ "## Write to Delta Lake\n",
+ "After doing your data processing in Dask, you can write the data back out to Delta Lake using `to_deltalake`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "084ba149-a179-4945-8c37-68732cf2c137",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ddt.to_deltalake(ddf, \"tmp/test_write\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "81d675e5-3363-45d5-ab86-4cd831bdcada",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "aec71505-2de9-408d-8a2f-ce48989eacd2",
+ "metadata": {},
+ "source": [
+ "## Contribute to `dask-deltatable`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b0370ae5-08a9-4873-9479-e6cfc169995a",
+ "metadata": {},
+ "source": [
+ "To contribute, go to the [`dask-deltatable` GitHub repository](https://github.com/dask-contrib/dask-deltatable)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a3fb5cbc-b5ab-42a5-bf6c-937e194568e7",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ad4eb23b-3dfb-469a-a7e8-6c23aa8bfb90",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:dask-delta-0140]",
+ "language": "python",
+ "name": "conda-env-dask-delta-0140-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}