From 18f0f5eeffe9a39dac68f6bfed48b9bbec57818e Mon Sep 17 00:00:00 2001 From: Tao He Date: Fri, 10 Nov 2023 00:13:45 +0800 Subject: [PATCH] Use arrow's string array to place string columns in vineyard (#1611) The change is backwards-compatible and fixes the issue in Rust SDK. Signed-off-by: Tao He --- .github/workflows/build-test-graph.yml | 12 +++---- .github/workflows/build-test.yml | 12 +++---- python/vineyard/data/tensor.py | 34 ++++++++++++++++--- python/vineyard/data/tests/test_dataframe.py | 7 ++++ .../src/ds/numpy_test.rs | 1 - .../src/ds/pandas_test.rs | 1 - 6 files changed, 49 insertions(+), 18 deletions(-) diff --git a/.github/workflows/build-test-graph.yml b/.github/workflows/build-test-graph.yml index 6ab20f98..716b067f 100644 --- a/.github/workflows/build-test-graph.yml +++ b/.github/workflows/build-test-graph.yml @@ -105,12 +105,12 @@ jobs: wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb sudo apt update - sudo apt install -y libarrow-dev=14.0.0-1 \ - libarrow-dataset-dev=14.0.0-1 \ - libarrow-acero-dev=14.0.0-1 \ - libarrow-flight-dev=14.0.0-1 \ - libgandiva-dev=14.0.0-1 \ - libparquet-dev=14.0.0-1 + sudo apt install -y libarrow-dev=14.0.1-1 \ + libarrow-dataset-dev=14.0.1-1 \ + libarrow-acero-dev=14.0.1-1 \ + libarrow-flight-dev=14.0.1-1 \ + libgandiva-dev=14.0.1-1 \ + libparquet-dev=14.0.1-1 # install clang-format sudo curl -L https://github.com/muttleyxd/clang-tools-static-binaries/releases/download/master-1d7ec53d/clang-format-11_linux-amd64 --output /usr/bin/clang-format diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index b89a0b9f..cad1aa4b 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -124,12 +124,12 @@ jobs: wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb sudo apt update - sudo apt install -y libarrow-dev=14.0.0-1 \ - libarrow-dataset-dev=14.0.0-1 \ - libarrow-acero-dev=14.0.0-1 \ - libarrow-flight-dev=14.0.0-1 \ - libgandiva-dev=14.0.0-1 \ - libparquet-dev=14.0.0-1 + sudo apt install -y libarrow-dev=14.0.1-1 \ + libarrow-dataset-dev=14.0.1-1 \ + libarrow-acero-dev=14.0.1-1 \ + libarrow-flight-dev=14.0.1-1 \ + libgandiva-dev=14.0.1-1 \ + libparquet-dev=14.0.1-1 # install deps for java sudo apt install -y default-jdk-headless maven diff --git a/python/vineyard/data/tensor.py b/python/vineyard/data/tensor.py index 480f9efd..25b2d438 100644 --- a/python/vineyard/data/tensor.py +++ b/python/vineyard/data/tensor.py @@ -19,6 +19,7 @@ import pickle import numpy as np +import pyarrow as pa from vineyard._C import Object from vineyard._C import ObjectID @@ -54,14 +55,39 @@ class ndarray(np.ndarray): def numpy_ndarray_builder(client, value, **kw): meta = ObjectMeta() - meta['typename'] = 'vineyard::Tensor<%s>' % normalize_cpptype(value.dtype) - meta['value_type_'] = value.dtype.name - meta['value_type_meta_'] = value.dtype.str meta['shape_'] = to_json(value.shape) meta['partition_index_'] = to_json(kw.get('partition_index', [])) meta['nbytes'] = value.nbytes meta['order_'] = to_json(('C' if value.flags['C_CONTIGUOUS'] else 'F')) - meta.add_member('buffer_', build_numpy_buffer(client, value)) + + if value.dtype.name == 'object' or value.dtype.name.startswith('str'): + # check if it can be used as a string array + try: + from vineyard.core.builder import get_current_builders + from vineyard.data.arrow import string_array_builder + + # string tensors in numpy like np.array(['a', 'b']) cannot be + # converted to pa.large_string_array directly. + try: + array = pa.array(value, type=pa.large_string()) + except: # noqa: E722, pylint: disable=bare-except + array = pa.array(value, type=pa.string()) + meta['typename'] = 'vineyard::Tensor' + meta['value_type_'] = 'string' + meta['value_type_meta_'] = 'str' + meta.add_member( + 'buffer_', string_array_builder(client, array, get_current_builders()) + ) + except: # noqa: E722, pylint: disable=bare-except + meta['typename'] = 'vineyard::Tensor<%s>' % normalize_cpptype(value.dtype) + meta['value_type_'] = value.dtype.name + meta['value_type_meta_'] = value.dtype.str + meta.add_member('buffer_', build_numpy_buffer(client, value)) + else: + meta['typename'] = 'vineyard::Tensor<%s>' % normalize_cpptype(value.dtype) + meta['value_type_'] = value.dtype.name + meta['value_type_meta_'] = value.dtype.str + meta.add_member('buffer_', build_numpy_buffer(client, value)) return client.create_metadata(meta) diff --git a/python/vineyard/data/tests/test_dataframe.py b/python/vineyard/data/tests/test_dataframe.py index 9adf47a7..6cdda028 100644 --- a/python/vineyard/data/tests/test_dataframe.py +++ b/python/vineyard/data/tests/test_dataframe.py @@ -33,6 +33,13 @@ def test_pandas_dataframe(vineyard_client): pd.testing.assert_frame_equal(df, vineyard_client.get(object_id)) +def test_pandas_dataframe_string(vineyard_client): + # see gh#533 + df = pd.DataFrame({'a': ['1', '2', '3', '4'], 'b': ['5', '6', '7', '8']}) + object_id = vineyard_client.put(df) + pd.testing.assert_frame_equal(df, vineyard_client.get(object_id)) + + def test_pandas_dataframe_complex_columns(vineyard_client): # see gh#533 df = pd.DataFrame([1, 2, 3, 4], columns=[['x']]) diff --git a/rust/vineyard-integration-testing/src/ds/numpy_test.rs b/rust/vineyard-integration-testing/src/ds/numpy_test.rs index 82e103d6..5d851aa6 100644 --- a/rust/vineyard-integration-testing/src/ds/numpy_test.rs +++ b/rust/vineyard-integration-testing/src/ds/numpy_test.rs @@ -60,7 +60,6 @@ mod tests { return Ok(()); } - #[ignore = "ndarray with string type in python side needs to be fixed"] #[test] fn test_numpy_string() -> Result<()> { use arrow_array::array::Array; diff --git a/rust/vineyard-integration-testing/src/ds/pandas_test.rs b/rust/vineyard-integration-testing/src/ds/pandas_test.rs index 6b513478..701ec04f 100644 --- a/rust/vineyard-integration-testing/src/ds/pandas_test.rs +++ b/rust/vineyard-integration-testing/src/ds/pandas_test.rs @@ -46,7 +46,6 @@ mod tests { return Ok(()); } - #[ignore = "ndarray with string type in python side needs to be fixed"] #[test] fn test_pandas_string() -> Result<()> { let ctx = Context::new();