From 03d9b46a6f2ebbac23681e6cbc05dd23ae072884 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 10 Aug 2023 07:40:29 +0000 Subject: [PATCH] update --- .devcontainer/devcontainer.json | 28 +++++++++-------- .github/workflows/test_dask.yml | 55 +++++++++++++++++++++++++++++++++ fugue_dask/_constants.py | 12 +++++-- fugue_dask/execution_engine.py | 6 +++- setup.py | 15 ++++----- 5 files changed, 93 insertions(+), 23 deletions(-) create mode 100644 .github/workflows/test_dask.yml diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index dc1fdead..a438eb7b 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -7,26 +7,28 @@ "settings": { "terminal.integrated.shell.linux": "/bin/bash", "python.pythonPath": "/usr/local/bin/python", - "python.linting.enabled": true, - "python.linting.pylintEnabled": true, - "python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8", - "python.formatting.blackPath": "/usr/local/py-utils/bin/black", - "python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf", - "python.linting.banditPath": "/usr/local/py-utils/bin/bandit", - "python.linting.flake8Path": "/usr/local/py-utils/bin/flake8", - "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy", - "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle", - "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle", - "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint" + "python.defaultInterpreterPath": "/usr/local/bin/python", + "isort.interpreter": [ + "/usr/local/bin/python" + ], + "flake8.interpreter": [ + "/usr/local/bin/python" + ], + "pylint.interpreter": [ + "/usr/local/bin/python" + ] }, "extensions": [ "ms-python.python", "ms-python.isort", + "ms-python.flake8", + "ms-python.pylint", + "ms-python.mypy", "GitHub.copilot", "njpwerner.autodocstring" ] - }}, - + } + }, "forwardPorts": [ 8888 ], diff --git a/.github/workflows/test_dask.yml b/.github/workflows/test_dask.yml new file mode 100644 index 00000000..92cc7485 --- /dev/null +++ b/.github/workflows/test_dask.yml @@ -0,0 +1,55 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Dask Tests + +on: + push: + branches: [ master ] + paths-ignore: + - 'docs/**' + - '**.md' + pull_request: + branches: [ master ] + paths-ignore: + - 'docs/**' + - '**.md' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + test_dask_lower_bound: + name: Dask 2023.2.0 + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.9 + uses: actions/setup-python@v1 + with: + python-version: 3.8 + - name: Install dependencies + run: make devenv + - name: Setup Dask + run: pip install pyarrow=11.0.0 pandas==2.0.2 dask[dataframe,distributed]==2023.5.0 + - name: Test + run: make testdask + + test_dask_latest: + name: Dask Latest + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.10 + uses: actions/setup-python@v1 + with: + python-version: "3.10" + - name: Install dependencies + run: make devenv + - name: Setup Dask + run: pip install -U dask[dataframe,distributed] pyarrow pandas + - name: Test + run: make testdask diff --git a/fugue_dask/_constants.py b/fugue_dask/_constants.py index 6f1ed6b3..e8ce20e0 100644 --- a/fugue_dask/_constants.py +++ b/fugue_dask/_constants.py @@ -1,7 +1,15 @@ from typing import Any, Dict -import pandas as pd + import dask +import pandas as pd +import pyarrow as pa +from packaging import version FUGUE_DASK_CONF_DEFAULT_PARTITIONS = "fugue.dask.default.partitions" FUGUE_DASK_DEFAULT_CONF: Dict[str, Any] = {FUGUE_DASK_CONF_DEFAULT_PARTITIONS: -1} -FUGUE_DASK_USE_ARROW = hasattr(pd, "ArrowDtype") and dask.__version__ >= "2023.7.1" +FUGUE_DASK_USE_ARROW = ( + hasattr(pd, "ArrowDtype") + and version.parse(dask.__version__) >= version.parse("2023.2") + and version.parse(pa.__version__) >= version.parse("7") + and version.parse(pd.__version__) >= version.parse("2") +) diff --git a/fugue_dask/execution_engine.py b/fugue_dask/execution_engine.py index 06e3c5f4..9e4e2fd0 100644 --- a/fugue_dask/execution_engine.py +++ b/fugue_dask/execution_engine.py @@ -476,7 +476,11 @@ def take( _presort: IndexedOrderedDict = presort or partition_spec.presort def _partition_take(partition, n, presort): - if len(presort.keys()) > 0: + assert_or_throw( + partition.shape[1] == len(meta), + FugueBug("hitting the dask bug where partition keys are lost"), + ) + if len(presort.keys()) > 0 and len(partition) > 1: partition = partition.sort_values( list(presort.keys()), ascending=list(presort.values()), diff --git a/setup.py b/setup.py index b1b7bb15..072586d9 100644 --- a/setup.py +++ b/setup.py @@ -33,8 +33,8 @@ def get_version() -> str: install_requires=[ "triad>=0.9.1", "adagio>=0.2.4", - "pyarrow>=0.15.1", - "pandas>=1.2.0", + "pyarrow>=6.0.1", + "pandas>=1.3.5", # sql dependencies "qpd>=0.4.4", "fugue-sql-antlr>=0.1.6", @@ -52,8 +52,9 @@ def get_version() -> str: "spark": ["pyspark>=3.1.1"], "dask": [ "dask[distributed,dataframe]; python_version < '3.8'", - "dask[distributed,dataframe]>=2022.9.0,<2023.7.1; python_version >= '3.8'", - "qpd[dask]>=0.4.4", + "dask[distributed,dataframe]>=2023.5.0; python_version >= '3.8'", + "pyarrow>=11.0.0", + "pandas>=2.0.2", ], "ray": ["ray[data]>=2.1.0", "duckdb>=0.5.0", "pyarrow>=6.0.1"], "duckdb": [ @@ -73,14 +74,14 @@ def get_version() -> str: "fugue-sql-antlr[cpp]>=0.1.6", "pyspark>=3.1.1", "dask[distributed,dataframe]; python_version < '3.8'", - "dask[distributed,dataframe]>=2022.9.0,<2023.7.1; python_version >= '3.8'", + "dask[distributed,dataframe]>=2023.5.0; python_version >= '3.8'", "ray[data]>=2.1.0", - "qpd[dask]>=0.4.4", "notebook", "jupyterlab", "ipython>=7.10.0", "duckdb>=0.5.0", - "pyarrow>=6.0.1", + "pyarrow>=11.0.0", + "pandas>=2.0.2", "ibis-framework>=2.1.1; python_version < '3.8'", "ibis-framework>=3.2.0,<6; python_version >= '3.8'", "polars",