diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml
index ff07a18c7..c509af1cb 100644
--- a/.github/workflows/rust-ci.yml
+++ b/.github/workflows/rust-ci.yml
@@ -50,7 +50,15 @@ jobs:
       - name: Install requirement
         uses: ConorMacBride/install-package@v1
         with:
-          apt: postgresql-client-14 cargo
+          apt: postgresql-client-14
+      - uses: actions-rs/toolchain@v1
+        with:
+          profile: minimal
+          toolchain: stable
+          default: true
+      - uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: "./rust -> target"
       - name: Init PG
         run: |
           ./script/meta_init_for_local_test.sh -j 2
diff --git a/lakesoul-flink/src/main/java/org/apache/flink/lakesoul/entry/SyncDatabase.java b/lakesoul-flink/src/main/java/org/apache/flink/lakesoul/entry/SyncDatabase.java
index 14b67bbaa..cc7398556 100644
--- a/lakesoul-flink/src/main/java/org/apache/flink/lakesoul/entry/SyncDatabase.java
+++ b/lakesoul-flink/src/main/java/org/apache/flink/lakesoul/entry/SyncDatabase.java
@@ -20,6 +20,7 @@
 import org.apache.flink.table.api.Table;
 import org.apache.flink.streaming.api.environment.CheckpointConfig;
 import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.table.api.TableDescriptor;
 import org.apache.flink.table.api.TableResult;
 import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
 import org.apache.flink.table.catalog.Catalog;
@@ -255,10 +256,9 @@ public static void xsyncToMysql(StreamExecutionEnvironment env) throws SQLExcept
         Catalog lakesoulCatalog = new LakeSoulCatalog();
         tEnvs.registerCatalog("lakeSoul", lakesoulCatalog);
         String jdbcUrl = url + targetDatabase;
-        TableResult schemaResult = tEnvs.executeSql(
-                "SELECT * FROM lakeSoul.`" + sourceDatabase + "`.`" + sourceTableName + "` LIMIT 1");
-        DataType[] fieldDataTypes = schemaResult.getTableSchema().getFieldDataTypes();
-        String[] fieldNames = schemaResult.getTableSchema().getFieldNames();
+        Table lakesoulTable = tEnvs.from("`lakeSoul`.`" + sourceDatabase + "`.`" + sourceTableName + "`");
+        DataType[] fieldDataTypes = lakesoulTable.getSchema().getFieldDataTypes();
+        String[] fieldNames = lakesoulTable.getSchema().getFieldNames();
         String tablePk = getTablePk(sourceDatabase, sourceTableName);
         String[] stringFieldsTypes = getMysqlFieldsTypes(fieldDataTypes, fieldNames, tablePk);
         String createTableSql = pgAndMsqlCreateTableSql(stringFieldsTypes, fieldNames, targetTableName, tablePk);
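The SyncDatabase.java hunk above changes how the source table's schema is discovered: instead of executing a SELECT ... LIMIT 1 against the LakeSoul catalog and reading the schema off the TableResult, the table is now resolved directly through the catalog and its schema read via Table#getSchema(). A minimal sketch of that pattern on Flink's bridging Table API follows; it is not part of the patch, the class name and the database/table names are hypothetical, and it assumes a catalog has already been registered under the name "lakeSoul" as the patch does.

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.types.DataType;

public class SchemaLookupSketch {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
        // As in the patch, a LakeSoul catalog must be registered first, e.g.:
        // tEnv.registerCatalog("lakeSoul", new LakeSoulCatalog());

        // Resolving the table through the catalog yields its schema directly;
        // no query is planned or executed, unlike the old SELECT ... LIMIT 1.
        Table table = tEnv.from("`lakeSoul`.`source_db`.`source_table`");
        DataType[] types = table.getSchema().getFieldDataTypes();
        String[] names = table.getSchema().getFieldNames();
        for (int i = 0; i < names.length; i++) {
            System.out.println(names[i] + ": " + types[i]);
        }
    }
}

Note that the catalog name is case-sensitive, so the path passed to from() must use the same spelling ("lakeSoul") that was passed to registerCatalog().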
diff --git a/lakesoul-flink/src/main/java/org/apache/flink/lakesoul/substrait/SubstraitFlinkUtil.java b/lakesoul-flink/src/main/java/org/apache/flink/lakesoul/substrait/SubstraitFlinkUtil.java
index 30ed111de..9f4206b6d 100644
--- a/lakesoul-flink/src/main/java/org/apache/flink/lakesoul/substrait/SubstraitFlinkUtil.java
+++ b/lakesoul-flink/src/main/java/org/apache/flink/lakesoul/substrait/SubstraitFlinkUtil.java
@@ -1,3 +1,7 @@
+// SPDX-FileCopyrightText: 2023 LakeSoul Contributors
+//
+// SPDX-License-Identifier: Apache-2.0
+
 package org.apache.flink.lakesoul.substrait;
 
 import io.substrait.expression.Expression;
diff --git a/lakesoul-flink/src/main/java/org/apache/flink/lakesoul/substrait/SubstraitVisitor.java b/lakesoul-flink/src/main/java/org/apache/flink/lakesoul/substrait/SubstraitVisitor.java
index 78c57ab07..66a442453 100644
--- a/lakesoul-flink/src/main/java/org/apache/flink/lakesoul/substrait/SubstraitVisitor.java
+++ b/lakesoul-flink/src/main/java/org/apache/flink/lakesoul/substrait/SubstraitVisitor.java
@@ -1,3 +1,7 @@
+// SPDX-FileCopyrightText: 2023 LakeSoul Contributors
+//
+// SPDX-License-Identifier: Apache-2.0
+
 package org.apache.flink.lakesoul.substrait;
 
 import com.dmetasoul.lakesoul.lakesoul.io.DateTimeUtils;
diff --git a/rust/Cargo.lock b/rust/Cargo.lock
index 8ae8cef40..2df3b28d2 100644
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -78,9 +78,9 @@ dependencies = [
 
 [[package]]
 name = "anstream"
-version = "0.6.14"
+version = "0.6.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b"
+checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526"
 dependencies = [
  "anstyle",
  "anstyle-parse",
@@ -93,33 +93,33 @@
 
 [[package]]
 name = "anstyle"
-version = "1.0.7"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b"
+checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1"
 
 [[package]]
 name = "anstyle-parse"
-version = "0.2.4"
+version = "0.2.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4"
+checksum = "eb47de1e80c2b463c735db5b217a0ddc39d612e7ac9e2e96a5aed1f57616c1cb"
 dependencies = [
  "utf8parse",
 ]
 
 [[package]]
 name = "anstyle-query"
-version = "1.1.0"
+version = "1.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391"
+checksum = "6d36fc52c7f6c869915e99412912f22093507da8d9e942ceaf66fe4b7c14422a"
 dependencies = [
  "windows-sys 0.52.0",
 ]
 
 [[package]]
 name = "anstyle-wincon"
-version = "3.0.3"
+version = "3.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19"
+checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8"
 dependencies = [
  "anstyle",
  "windows-sys 0.52.0",
@@ -139,22 +139,22 @@ checksum = "3d62b7694a562cdf5a74227903507c56ab2cc8bdd1f781ed5cb4cf9c9f810bfc"
 
 [[package]]
 name = "arrayref"
-version = "0.3.7"
+version = "0.3.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545"
+checksum = "9d151e35f61089500b617991b791fc8bfd237ae50cd5950803758a179b41e67a"
 
 [[package]]
 name = "arrayvec"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
+checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
 
 [[package]]
 name = "arrow"
-version = "48.0.1"
-source = "git+https://github.com/lakesoul-io/arrow-rs.git?branch=arrow-rs-48-parquet-bufferred#bb0ebe19cb0b43fcf1b7f3606f33e8b6e0eea756"
+version = "52.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05048a8932648b63f21c37d88b552ccc8a65afb6dfe9fc9f30ce79174c2e7a85"
 dependencies = [
- "ahash",
  "arrow-arith",
  "arrow-array",
  "arrow-buffer",
@@ -172,8 +172,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-arith"
-version = "48.0.1"
-source = "git+https://github.com/lakesoul-io/arrow-rs.git?branch=arrow-rs-48-parquet-bufferred#bb0ebe19cb0b43fcf1b7f3606f33e8b6e0eea756"
+version = "52.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d8a57966e43bfe9a3277984a14c24ec617ad874e4c0e1d2a1b083a39cfbf22c"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -186,8 +187,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-array"
-version = "48.0.1"
-source = "git+https://github.com/lakesoul-io/arrow-rs.git?branch=arrow-rs-48-parquet-bufferred#bb0ebe19cb0b43fcf1b7f3606f33e8b6e0eea756"
+version = "52.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "16f4a9468c882dc66862cef4e1fd8423d47e67972377d85d80e022786427768c"
 dependencies = [
  "ahash",
  "arrow-buffer",
@@ -202,8 +204,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-buffer"
-version = "48.0.1"
-source = "git+https://github.com/lakesoul-io/arrow-rs.git?branch=arrow-rs-48-parquet-bufferred#bb0ebe19cb0b43fcf1b7f3606f33e8b6e0eea756"
+version = "52.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c975484888fc95ec4a632cdc98be39c085b1bb518531b0c80c5d462063e5daa1"
 dependencies = [
  "bytes",
  "half",
@@ -212,25 +215,30 @@ dependencies = [
 
 [[package]]
 name = "arrow-cast"
-version = "48.0.1"
-source = "git+https://github.com/lakesoul-io/arrow-rs.git?branch=arrow-rs-48-parquet-bufferred#bb0ebe19cb0b43fcf1b7f3606f33e8b6e0eea756"
+version = "52.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da26719e76b81d8bc3faad1d4dbdc1bcc10d14704e63dc17fc9f3e7e1e567c8e"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
  "arrow-data",
  "arrow-schema",
  "arrow-select",
+ "atoi",
+ "base64 0.22.1",
  "chrono",
  "comfy-table 7.1.1",
  "half",
  "lexical-core",
  "num",
+ "ryu",
 ]
 
 [[package]]
 name = "arrow-csv"
-version = "48.0.1"
-source = "git+https://github.com/lakesoul-io/arrow-rs.git?branch=arrow-rs-48-parquet-bufferred#bb0ebe19cb0b43fcf1b7f3606f33e8b6e0eea756"
+version = "52.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c13c36dc5ddf8c128df19bab27898eea64bf9da2b555ec1cd17a8ff57fba9ec2"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -247,8 +255,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-data"
-version = "48.0.1"
-source = "git+https://github.com/lakesoul-io/arrow-rs.git?branch=arrow-rs-48-parquet-bufferred#bb0ebe19cb0b43fcf1b7f3606f33e8b6e0eea756"
+version = "52.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dd9d6f18c65ef7a2573ab498c374d8ae364b4a4edf67105357491c031f716ca5"
 dependencies = [
  "arrow-buffer",
  "arrow-schema",
@@ -258,8 +267,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-ipc"
-version = "48.0.1"
-source = "git+https://github.com/lakesoul-io/arrow-rs.git?branch=arrow-rs-48-parquet-bufferred#bb0ebe19cb0b43fcf1b7f3606f33e8b6e0eea756"
+version = "52.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e786e1cdd952205d9a8afc69397b317cfbb6e0095e445c69cda7e8da5c1eeb0f"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -267,12 +277,14 @@ dependencies = [
  "arrow-data",
  "arrow-schema",
  "flatbuffers",
+ "lz4_flex",
 ]
 
 [[package]]
 name = "arrow-json"
-version = "48.0.1"
-source = "git+https://github.com/lakesoul-io/arrow-rs.git?branch=arrow-rs-48-parquet-bufferred#bb0ebe19cb0b43fcf1b7f3606f33e8b6e0eea756"
+version = "52.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb22284c5a2a01d73cebfd88a33511a3234ab45d66086b2ca2d1228c3498e445"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -281,7 +293,7 @@ dependencies = [
  "arrow-schema",
  "chrono",
  "half",
- "indexmap 2.2.6",
+ "indexmap 2.4.0",
  "lexical-core",
  "num",
  "serde",
@@ -290,8 +302,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-ord"
-version = "48.0.1"
-source = "git+https://github.com/lakesoul-io/arrow-rs.git?branch=arrow-rs-48-parquet-bufferred#bb0ebe19cb0b43fcf1b7f3606f33e8b6e0eea756"
+version = "52.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42745f86b1ab99ef96d1c0bcf49180848a64fe2c7a7a0d945bc64fa2b21ba9bc"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -304,8 +317,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-row"
-version = "48.0.1"
-source = "git+https://github.com/lakesoul-io/arrow-rs.git?branch=arrow-rs-48-parquet-bufferred#bb0ebe19cb0b43fcf1b7f3606f33e8b6e0eea756"
+version = "52.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4cd09a518c602a55bd406bcc291a967b284cfa7a63edfbf8b897ea4748aad23c"
 dependencies = [
  "ahash",
  "arrow-array",
@@ -313,22 +327,23 @@
  "arrow-data",
  "arrow-schema",
  "half",
- "hashbrown 0.14.5",
 ]
 
 [[package]]
 name = "arrow-schema"
-version = "48.0.1"
-source = "git+https://github.com/lakesoul-io/arrow-rs.git?branch=arrow-rs-48-parquet-bufferred#bb0ebe19cb0b43fcf1b7f3606f33e8b6e0eea756"
+version = "52.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e972cd1ff4a4ccd22f86d3e53e835c2ed92e0eea6a3e8eadb72b4f1ac802cf8"
 dependencies = [
- "bitflags 2.5.0",
+ "bitflags 2.6.0",
  "serde",
 ]
 
 [[package]]
 name = "arrow-select"
-version = "48.0.1"
-source = "git+https://github.com/lakesoul-io/arrow-rs.git?branch=arrow-rs-48-parquet-bufferred#bb0ebe19cb0b43fcf1b7f3606f33e8b6e0eea756"
+version = "52.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "600bae05d43483d216fb3494f8c32fdbefd8aa4e1de237e790dbb3d9f44690a3"
 dependencies = [
  "ahash",
  "arrow-array",
@@ -340,14 +355,16 @@ dependencies = [
 
 [[package]]
 name = "arrow-string"
-version = "48.0.1"
-source = "git+https://github.com/lakesoul-io/arrow-rs.git?branch=arrow-rs-48-parquet-bufferred#bb0ebe19cb0b43fcf1b7f3606f33e8b6e0eea756"
+version = "52.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0dc1985b67cb45f6606a248ac2b4a288849f196bab8c657ea5589f47cdd55e6"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
  "arrow-data",
  "arrow-schema",
  "arrow-select",
+ "memchr",
  "num",
  "regex",
  "regex-syntax 0.8.4",
@@ -367,9 +384,9 @@ dependencies = [
 
 [[package]]
 name = "async-compression"
-version = "0.4.11"
+version = "0.4.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd066d0b4ef8ecb03a55319dc13aa6910616d0f44008a045bb1835af830abff5"
+checksum = "fec134f64e2bc57411226dfc4e52dec859ddfc7e711fc5e07b612584f000e4aa"
 dependencies = [
  "bzip2",
  "flate2",
@@ -391,7 +408,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.66",
+ "syn 2.0.75",
 ]
 
 [[package]]
@@ -402,13 +419,22 @@ checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de"
 
 [[package]]
 name = "async-trait"
-version = "0.1.80"
+version = "0.1.81"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca"
+checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.66",
+ "syn 2.0.75",
+]
+
+[[package]]
+name = "atoi"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528"
+dependencies = [
+ "num-traits",
 ]
 
 [[package]]
@@ -440,15 +466,6 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
 
-[[package]]
-name = "autotools"
-version = "0.2.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ef941527c41b0fc0dd48511a8154cd5fc7e29200a0ff8b7203c5d777dbc795cf"
-dependencies = [
- "cc",
-]
-
 [[package]]
 name = "backtrace"
 version = "0.3.73"
@@ -470,6 +487,12 @@ version = "0.21.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
 
+[[package]]
+name = "base64"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
+
 [[package]]
 name = "bitflags"
 version = "1.3.2"
@@ -478,9 +501,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
 
 [[package]]
 name = "bitflags"
-version = "2.5.0"
+version = "2.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1"
+checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
 
 [[package]]
 name = "blake2"
@@ -493,9 +516,9 @@ dependencies = [
 
 [[package]]
 name = "blake3"
-version = "1.5.1"
+version = "1.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "30cca6d3674597c30ddf2c587bf8d9d65c9a84d2326d941cc79c9842dfe0ef52"
+checksum = "e9ec96fe9a81b5e365f9db71fe00edc4fe4ca2cc7dcb7861f0603012a7caa210"
 dependencies = [
  "arrayref",
  "arrayvec",
@@ -528,9 +551,9 @@ dependencies = [
 
 [[package]]
 name = "brotli"
-version = "3.5.0"
+version = "6.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d640d25bc63c50fb1f0b545ffd80207d2e10a4c965530809b40ba3386825c391"
+checksum = "74f7971dbd9326d58187408ab83117d8ac1bb9c17b085fdacd1cf2f598719b6b"
 dependencies = [
  "alloc-no-stdlib",
  "alloc-stdlib",
@@ -539,9 +562,9 @@ dependencies = [
 
 [[package]]
 name = "brotli-decompressor"
-version = "2.5.1"
+version = "4.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4e2e4afe60d7dd600fdd3de8d0f08c2b7ec039712e3b6137ff98b7004e82de4f"
+checksum = "9a45bd2e4095a8b518033b128020dd4a55aab1c0a381ba4404a472630f4bc362"
 dependencies = [
  "alloc-no-stdlib",
  "alloc-stdlib",
@@ -561,9 +584,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 
 [[package]]
 name = "bytes"
-version = "1.6.0"
+version = "1.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9"
+checksum = "8318a53db07bb3f8dca91a600466bdb3f2eaadeedfdbcf02e1accbad9271ba50"
 
 [[package]]
 name = "bzip2"
@@ -607,13 +630,13 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.0.99"
+version = "1.1.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96c51067fd44124faa7f870b4b1c969379ad32b2ba805aa959430ceaa384f695"
+checksum = "72db2f7947ecee9b03b510377e8bb9077afa27176fdbff55c51027e976fdcc48"
 dependencies = [
  "jobserver",
  "libc",
- "once_cell",
+ "shlex",
 ]
 
 [[package]]
@@ -635,14 +658,14 @@ dependencies = [
"pure-rust-locales", "serde", "wasm-bindgen", - "windows-targets 0.52.5", + "windows-targets 0.52.6", ] [[package]] name = "chrono-tz" -version = "0.8.6" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d59ae0466b83e838b81a54256c39d5d7c20b9d7daa10510a242d9b75abd5936e" +checksum = "93698b29de5e97ad0ae26447b344c482a7284c737d9ddc5f9e52b74a336671bb" dependencies = [ "chrono", "chrono-tz-build", @@ -651,9 +674,9 @@ dependencies = [ [[package]] name = "chrono-tz-build" -version = "0.2.1" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "433e39f13c9a060046954e0592a8d0a4bcb1040125cbf91cb8ee58964cfb350f" +checksum = "0c088aee841df9c3041febbb73934cfc39708749bf96dc827e3359cd39ef11b1" dependencies = [ "parse-zoneinfo", "phf", @@ -684,11 +707,20 @@ dependencies = [ "os_str_bytes", ] +[[package]] +name = "cmake" +version = "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" +checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0" [[package]] name = "comfy-table" @@ -708,7 +740,7 @@ version = "7.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7" dependencies = [ - "strum 0.26.2", + "strum 0.26.3", "strum_macros 0.26.4", "unicode-width", ] @@ -760,15 +792,15 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.6" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "cpufeatures" -version = "0.2.12" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +checksum = "51e852e6dc9a5bed1fae92dd2375037bf2b768725bf3be87811edee3249d09ad" dependencies = [ "libc", ] @@ -797,7 +829,7 @@ dependencies = [ "bitflags 1.3.2", "crossterm_winapi", "libc", - "mio", + "mio 0.8.11", "parking_lot", "signal-hook", "signal-hook-mio", @@ -857,7 +889,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edb49164822f3ee45b17acd4a208cfc1251410cf0cad9a833234c9890774dd9f" dependencies = [ "quote", - "syn 2.0.66", + "syn 2.0.75", ] [[package]] @@ -868,11 +900,12 @@ checksum = "7762d17f1241643615821a8455a0b2c3e803784b058693d990b11f2dce25a0ca" [[package]] name = "dashmap" -version = "5.5.3" +version = "6.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +checksum = "804c8821570c3f8b70230c2ba75ffa5c0f9a4189b9a432b6656c536712acae28" dependencies = [ "cfg-if", + "crossbeam-utils", "hashbrown 0.14.5", "lock_api", "once_cell", @@ -881,12 +914,14 @@ dependencies = [ [[package]] name = "datafusion" -version = "33.0.0" -source = "git+https://github.com/lakesoul-io/arrow-datafusion.git?branch=datafusion-33-parquet-prefetch#235eb27b6b0d23b18fb4a111fecbf5fa1b0d46a2" +version = "41.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4fd4a99fc70d40ef7e52b243b4a399c3f8d353a40d5ecb200deee05e49c61bb" dependencies = [ "ahash", "arrow", "arrow-array", + "arrow-ipc", "arrow-schema", "async-compression", "async-trait", @@ -894,11 +929,18 @@ dependencies = [ "bzip2", "chrono", "dashmap", + "datafusion-catalog", "datafusion-common", + "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-nested", "datafusion-optimizer", "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", "datafusion-physical-plan", "datafusion-sql", "flate2", @@ -906,13 +948,14 @@ dependencies = [ "glob", "half", "hashbrown 0.14.5", - "indexmap 2.2.6", - "itertools 0.11.0", + "indexmap 2.4.0", + "itertools 0.12.1", "log", "num_cpus", "object_store", "parking_lot", "parquet", + "paste", "pin-project-lite", "rand", "sqlparser", @@ -925,10 +968,25 @@ dependencies = [ "zstd", ] +[[package]] +name = "datafusion-catalog" +version = "41.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13b3cfbd84c6003594ae1972314e3df303a27ce8ce755fcea3240c90f4c0529" +dependencies = [ + "arrow-schema", + "async-trait", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-plan", +] + [[package]] name = "datafusion-common" -version = "33.0.0" -source = "git+https://github.com/lakesoul-io/arrow-datafusion.git?branch=datafusion-33-parquet-prefetch#235eb27b6b0d23b18fb4a111fecbf5fa1b0d46a2" +version = "41.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44fdbc877e3e40dcf88cc8f283d9f5c8851f0a3aa07fee657b1b75ac1ad49b9c" dependencies = [ "ahash", "arrow", @@ -937,16 +995,29 @@ dependencies = [ "arrow-schema", "chrono", "half", + "hashbrown 0.14.5", + "instant", + "libc", "num_cpus", "object_store", "parquet", "sqlparser", ] +[[package]] +name = "datafusion-common-runtime" +version = "41.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7496d1f664179f6ce3a5cbef6566056ccaf3ea4aa72cc455f80e62c1dd86b1" +dependencies = [ + "tokio", +] + [[package]] name = "datafusion-execution" -version = "33.0.0" -source = "git+https://github.com/lakesoul-io/arrow-datafusion.git?branch=datafusion-33-parquet-prefetch#235eb27b6b0d23b18fb4a111fecbf5fa1b0d46a2" +version = "41.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "799e70968c815b611116951e3dd876aef04bf217da31b72eec01ee6a959336a1" dependencies = [ "arrow", "chrono", @@ -965,22 +1036,95 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "33.0.0" -source = "git+https://github.com/lakesoul-io/arrow-datafusion.git?branch=datafusion-33-parquet-prefetch#235eb27b6b0d23b18fb4a111fecbf5fa1b0d46a2" +version = "41.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c1841c409d9518c17971d15c9bae62e629eb937e6fb6c68cd32e9186f8b30d2" dependencies = [ "ahash", "arrow", "arrow-array", + "arrow-buffer", + "chrono", + "datafusion-common", + "paste", + "serde_json", + "sqlparser", + "strum 0.26.3", + "strum_macros 0.26.4", +] + +[[package]] +name = "datafusion-functions" +version = "41.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8e481cf34d2a444bd8fa09b65945f0ce83dc92df8665b761505b3d9f351bebb" +dependencies = [ + "arrow", + "arrow-buffer", + "base64 0.22.1", + "blake2", + "blake3", + "chrono", + 
"datafusion-common", + "datafusion-execution", + "datafusion-expr", + "hashbrown 0.14.5", + "hex", + "itertools 0.12.1", + "log", + "md-5", + "rand", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-functions-aggregate" +version = "41.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b4ece19f73c02727e5e8654d79cd5652de371352c1df3c4ac3e419ecd6943fb" +dependencies = [ + "ahash", + "arrow", + "arrow-schema", "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "log", + "paste", "sqlparser", - "strum 0.25.0", - "strum_macros 0.25.3", +] + +[[package]] +name = "datafusion-functions-nested" +version = "41.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1474552cc824e8c9c88177d454db5781d4b66757d4aca75719306b8343a5e8d" +dependencies = [ + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-ord", + "arrow-schema", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate", + "itertools 0.12.1", + "log", + "paste", + "rand", ] [[package]] name = "datafusion-optimizer" -version = "33.0.0" -source = "git+https://github.com/lakesoul-io/arrow-datafusion.git?branch=datafusion-33-parquet-prefetch#235eb27b6b0d23b18fb4a111fecbf5fa1b0d46a2" +version = "41.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "791ff56f55608bc542d1ea7a68a64bdc86a9413f5a381d06a39fd49c2a3ab906" dependencies = [ "arrow", "async-trait", @@ -989,15 +1133,18 @@ dependencies = [ "datafusion-expr", "datafusion-physical-expr", "hashbrown 0.14.5", - "itertools 0.11.0", + "indexmap 2.4.0", + "itertools 0.12.1", "log", + "paste", "regex-syntax 0.8.4", ] [[package]] name = "datafusion-physical-expr" -version = "33.0.0" -source = "git+https://github.com/lakesoul-io/arrow-datafusion.git?branch=datafusion-33-parquet-prefetch#235eb27b6b0d23b18fb4a111fecbf5fa1b0d46a2" +version = "41.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a223962b3041304a3e20ed07a21d5de3d88d7e4e71ca192135db6d24e3365a4" dependencies = [ "ahash", "arrow", @@ -1005,86 +1152,117 @@ dependencies = [ "arrow-buffer", "arrow-ord", "arrow-schema", - "base64", - "blake2", - "blake3", + "arrow-string", + "base64 0.22.1", "chrono", "datafusion-common", + "datafusion-execution", "datafusion-expr", + "datafusion-physical-expr-common", "half", "hashbrown 0.14.5", "hex", - "indexmap 2.2.6", - "itertools 0.11.0", - "libc", + "indexmap 2.4.0", + "itertools 0.12.1", "log", - "md-5", "paste", "petgraph", - "rand", "regex", - "sha2", - "unicode-segmentation", - "uuid", +] + +[[package]] +name = "datafusion-physical-expr-common" +version = "41.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db5e7d8532a1601cd916881db87a70b0a599900d23f3db2897d389032da53bc6" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr", + "hashbrown 0.14.5", + "rand", +] + +[[package]] +name = "datafusion-physical-optimizer" +version = "41.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb9c78f308e050f5004671039786a925c3fee83b90004e9fcfd328d7febdcc0" +dependencies = [ + "datafusion-common", + "datafusion-execution", + "datafusion-physical-expr", + "datafusion-physical-plan", ] [[package]] name = "datafusion-physical-plan" -version = "33.0.0" -source = 
"git+https://github.com/lakesoul-io/arrow-datafusion.git?branch=datafusion-33-parquet-prefetch#235eb27b6b0d23b18fb4a111fecbf5fa1b0d46a2" +version = "41.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d1116949432eb2d30f6362707e2846d942e491052a206f2ddcb42d08aea1ffe" dependencies = [ "ahash", "arrow", "arrow-array", "arrow-buffer", + "arrow-ord", "arrow-schema", "async-trait", "chrono", "datafusion-common", + "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", + "datafusion-functions-aggregate", "datafusion-physical-expr", + "datafusion-physical-expr-common", "futures", "half", "hashbrown 0.14.5", - "indexmap 2.2.6", - "itertools 0.11.0", + "indexmap 2.4.0", + "itertools 0.12.1", "log", "once_cell", "parking_lot", "pin-project-lite", "rand", "tokio", - "uuid", ] [[package]] name = "datafusion-sql" -version = "33.0.0" -source = "git+https://github.com/lakesoul-io/arrow-datafusion.git?branch=datafusion-33-parquet-prefetch#235eb27b6b0d23b18fb4a111fecbf5fa1b0d46a2" +version = "41.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b45d0180711165fe94015d7c4123eb3e1cf5fb60b1506453200b8d1ce666bef0" dependencies = [ "arrow", + "arrow-array", "arrow-schema", "datafusion-common", "datafusion-expr", "log", + "regex", "sqlparser", + "strum 0.26.3", ] [[package]] name = "datafusion-substrait" -version = "33.0.0" -source = "git+https://github.com/lakesoul-io/arrow-datafusion.git?branch=datafusion-33-parquet-prefetch#235eb27b6b0d23b18fb4a111fecbf5fa1b0d46a2" +version = "41.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf0a0055aa98246c79f98f0d03df11f16cb7adc87818d02d4413e3f3cdadbbee" dependencies = [ + "arrow-buffer", "async-recursion", "chrono", "datafusion", - "itertools 0.11.0", + "itertools 0.12.1", "object_store", + "pbjson-types", "prost", - "prost-types", "substrait", - "tokio", + "url", ] [[package]] @@ -1109,17 +1287,6 @@ dependencies = [ "subtle", ] -[[package]] -name = "displaydoc" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.66", -] - [[package]] name = "doc-comment" version = "0.3.3" @@ -1134,24 +1301,15 @@ checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" [[package]] name = "either" -version = "1.12.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" - -[[package]] -name = "encoding_rs" -version = "0.8.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" -dependencies = [ - "cfg-if", -] +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" [[package]] name = "env_filter" -version = "0.1.0" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a009aa4810eb158359dda09d0c87378e4bbb89b5a801f016885a4707ba24f7ea" +checksum = "4f2c92ceda6ceec50f43169f9ee8424fe2db276791afde7b2cd8bc084cb376ab" dependencies = [ "log", "regex", @@ -1159,9 +1317,9 @@ dependencies = [ [[package]] name = "env_logger" -version = "0.11.3" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38b35839ba51819680ba087cd351788c9a3c476841207e0b8cee0b04722343b9" +checksum = 
"e13fa619b91fb2381732789fc5de83b45675e882f66623b7d8cb4f643017018d" dependencies = [ "anstream", "anstyle", @@ -1227,9 +1385,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flatbuffers" -version = "23.5.26" +version = "24.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dac53e22462d78c16d64a1cd22371b54cc3fe94aa15e7886a2fa6e5d1ab8640" +checksum = "8add37afff2d4ffa83bc748a70b4b1370984f6980768554182424ef71447c35f" dependencies = [ "bitflags 1.3.2", "rustc_version", @@ -1237,9 +1395,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.30" +version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae" +checksum = "7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920" dependencies = [ "crc32fast", "miniz_oxide", @@ -1326,7 +1484,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.75", ] [[package]] @@ -1386,19 +1544,6 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" -[[package]] -name = "git2" -version = "0.18.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "232e6a7bfe35766bf715e55a88b39a700596c0ccfd88cd3680b4cdb40d66ef70" -dependencies = [ - "bitflags 2.5.0", - "libc", - "libgit2-sys", - "log", - "url", -] - [[package]] name = "glob" version = "0.3.1" @@ -1407,17 +1552,17 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.3.26" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" +checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab" dependencies = [ + "atomic-waker", "bytes", "fnv", "futures-core", "futures-sink", - "futures-util", "http", - "indexmap 2.2.6", + "indexmap 2.4.0", "slab", "tokio", "tokio-util", @@ -1441,15 +1586,6 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" -[[package]] -name = "hashbrown" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" -dependencies = [ - "ahash", -] - [[package]] name = "hashbrown" version = "0.14.5" @@ -1527,9 +1663,9 @@ dependencies = [ [[package]] name = "http" -version = "0.2.12" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" dependencies = [ "bytes", "fnv", @@ -1538,26 +1674,32 @@ dependencies = [ [[package]] name = "http-body" -version = "0.4.6" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", "http", - "pin-project-lite", ] [[package]] -name = "httparse" -version = "1.9.3" +name = "http-body-util" +version = "0.1.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0e7a4dd27b9476dc40cb050d3632d3bba3a70ddbff012285f7f8559a1e7e545" +checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" +dependencies = [ + "bytes", + "futures-util", + "http", + "http-body", + "pin-project-lite", +] [[package]] -name = "httpdate" -version = "1.0.3" +name = "httparse" +version = "1.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" +checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" [[package]] name = "humantime" @@ -1567,40 +1709,60 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "0.14.29" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f361cde2f109281a220d4307746cdfd5ee3f410da58a70377762396775634b33" +checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" dependencies = [ "bytes", "futures-channel", - "futures-core", "futures-util", "h2", "http", "http-body", "httparse", - "httpdate", "itoa", "pin-project-lite", - "socket2", + "smallvec", "tokio", - "tower-service", - "tracing", "want", ] [[package]] name = "hyper-rustls" -version = "0.24.2" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155" dependencies = [ "futures-util", "http", "hyper", + "hyper-util", "rustls", + "rustls-native-certs", + "rustls-pki-types", "tokio", "tokio-rustls", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http", + "http-body", + "hyper", + "pin-project-lite", + "socket2", + "tokio", + "tower", + "tower-service", + "tracing", ] [[package]] @@ -1626,134 +1788,14 @@ dependencies = [ "cc", ] -[[package]] -name = "icu_collections" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" -dependencies = [ - "displaydoc", - "yoke", - "zerofrom", - "zerovec", -] - -[[package]] -name = "icu_locid" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" -dependencies = [ - "displaydoc", - "litemap", - "tinystr", - "writeable", - "zerovec", -] - -[[package]] -name = "icu_locid_transform" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" -dependencies = [ - "displaydoc", - "icu_locid", - "icu_locid_transform_data", - "icu_provider", - "tinystr", - "zerovec", -] - -[[package]] -name = "icu_locid_transform_data" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" - -[[package]] -name = "icu_normalizer" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" -dependencies = [ - 
"displaydoc", - "icu_collections", - "icu_normalizer_data", - "icu_properties", - "icu_provider", - "smallvec", - "utf16_iter", - "utf8_iter", - "write16", - "zerovec", -] - -[[package]] -name = "icu_normalizer_data" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" - -[[package]] -name = "icu_properties" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f8ac670d7422d7f76b32e17a5db556510825b29ec9154f235977c9caba61036" -dependencies = [ - "displaydoc", - "icu_collections", - "icu_locid_transform", - "icu_properties_data", - "icu_provider", - "tinystr", - "zerovec", -] - -[[package]] -name = "icu_properties_data" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" - -[[package]] -name = "icu_provider" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" -dependencies = [ - "displaydoc", - "icu_locid", - "icu_provider_macros", - "stable_deref_trait", - "tinystr", - "writeable", - "yoke", - "zerofrom", - "zerovec", -] - -[[package]] -name = "icu_provider_macros" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.66", -] - [[package]] name = "idna" -version = "1.0.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4716a3a0933a1d01c2f72450e89596eb51dd34ef3c211ccd875acdf1f8fe47ed" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" dependencies = [ - "icu_normalizer", - "icu_properties", - "smallvec", - "utf8_iter", + "unicode-bidi", + "unicode-normalization", ] [[package]] @@ -1768,14 +1810,26 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.6" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +checksum = "93ead53efc7ea8ed3cfb0c79fc8023fbb782a5432b52830b6518941cebe6505c" dependencies = [ "equivalent", "hashbrown 0.14.5", ] +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "integer-encoding" version = "3.0.4" @@ -1790,9 +1844,9 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" [[package]] name = "is_terminal_polyfill" -version = "1.70.0" +version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "itertools" @@ -1812,6 +1866,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.11" @@ -1830,18 +1893,18 @@ dependencies = [ 
[[package]] name = "jobserver" -version = "0.1.31" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" dependencies = [ "libc", ] [[package]] name = "js-sys" -version = "0.3.69" +version = "0.3.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a" dependencies = [ "wasm-bindgen", ] @@ -1987,9 +2050,9 @@ dependencies = [ [[package]] name = "lazy_static" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lexical-core" @@ -2057,21 +2120,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.155" +version = "0.2.157" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" - -[[package]] -name = "libgit2-sys" -version = "0.16.2+1.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee4126d8b4ee5c9d9ea891dd875cfdc1e9d0950437179104b183d7d8a74d24e8" -dependencies = [ - "cc", - "libc", - "libz-sys", - "pkg-config", -] +checksum = "374af5f94e54fa97cf75e945cce8a6b201e88a1a07e688b47dfd2a59c66dbd86" [[package]] name = "libm" @@ -2079,30 +2130,12 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" -[[package]] -name = "libz-sys" -version = "1.1.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c15da26e5af7e25c90b37a2d75cdbf940cf4a55316de9d84c679c9b8bfabf82e" -dependencies = [ - "cc", - "libc", - "pkg-config", - "vcpkg", -] - [[package]] name = "linux-raw-sys" version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" -[[package]] -name = "litemap" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" - [[package]] name = "lock_api" version = "0.4.12" @@ -2115,9 +2148,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.21" +version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "lz4_flex" @@ -2172,9 +2205,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87dfd01fe195c66b572b37921ad8803d010623c0aca821bea2302239d155cdae" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" dependencies = [ "adler", ] @@ -2191,6 +2224,18 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "mio" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" +dependencies = [ + "hermit-abi 0.3.9", + "libc", + "wasi", + "windows-sys 0.52.0", +] + [[package]] name = "multimap" version = "0.10.0" @@ -2223,9 +2268,9 @@ dependencies = [ [[package]] name = "num-bigint" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c165a9ab64cf766f73521c0dd2cfdff64f488b8f0b3e621face3462d3db536d7" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ "num-integer", "num-traits", @@ -2314,32 +2359,34 @@ dependencies = [ [[package]] name = "object" -version = "0.36.0" +version = "0.36.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "576dfe1fc8f9df304abb159d767a29d0476f7750fbf8aa7ad07816004a207434" +checksum = "27b64972346851a39438c60b341ebc01bba47464ae329e55cf343eb93964efd9" dependencies = [ "memchr", ] [[package]] name = "object_store" -version = "0.7.1" -source = "git+https://github.com/lakesoul-io/arrow-rs.git?branch=object_store_0.7_opt#3fd4b6ba312ddc18b7f8ce509350546ef03c7ae6" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6da452820c715ce78221e8202ccc599b4a52f3e1eb3eedb487b680c81a8e3f3" dependencies = [ "async-trait", - "base64", + "base64 0.22.1", "bytes", "chrono", "futures", "humantime", "hyper", - "itertools 0.11.0", + "itertools 0.13.0", + "md-5", "parking_lot", "percent-encoding", "quick-xml", "rand", "reqwest", - "ring 0.16.20", + "ring", "serde", "serde_json", "snafu", @@ -2355,6 +2402,12 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + [[package]] name = "ordered-float" version = "2.10.1" @@ -2400,15 +2453,16 @@ checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.5.2", + "redox_syscall 0.5.3", "smallvec", - "windows-targets 0.52.5", + "windows-targets 0.52.6", ] [[package]] name = "parquet" -version = "48.0.1" -source = "git+https://github.com/lakesoul-io/arrow-rs.git?branch=arrow-rs-48-parquet-bufferred#bb0ebe19cb0b43fcf1b7f3606f33e8b6e0eea756" +version = "52.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e977b9066b4d3b03555c22bdc442f3fadebd96a39111249113087d0edb2691cd" dependencies = [ "ahash", "arrow-array", @@ -2418,12 +2472,13 @@ dependencies = [ "arrow-ipc", "arrow-schema", "arrow-select", - "base64", + "base64 0.22.1", "brotli", "bytes", "chrono", "flate2", "futures", + "half", "hashbrown 0.14.5", "lz4_flex", "num", @@ -2436,6 +2491,7 @@ dependencies = [ "tokio", "twox-hash", "zstd", + "zstd-sys", ] [[package]] @@ -2453,6 +2509,43 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "pbjson" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1030c719b0ec2a2d25a5df729d6cff1acf3cc230bf766f4f97833591f7577b90" +dependencies = [ + "base64 0.21.7", + "serde", +] + +[[package]] +name = "pbjson-build" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "2580e33f2292d34be285c5bc3dba5259542b083cfad6037b6d70345f24dcb735" +dependencies = [ + "heck 0.4.1", + "itertools 0.11.0", + "prost", + "prost-types", +] + +[[package]] +name = "pbjson-types" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18f596653ba4ac51bdecbb4ef6773bc7f56042dc13927910de1684ad3d32aa12" +dependencies = [ + "bytes", + "chrono", + "pbjson", + "pbjson-build", + "prost", + "prost-build", + "serde", +] + [[package]] name = "percent-encoding" version = "2.3.1" @@ -2466,7 +2559,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" dependencies = [ "fixedbitset", - "indexmap 2.2.6", + "indexmap 2.4.0", ] [[package]] @@ -2507,6 +2600,26 @@ dependencies = [ "siphasher", ] +[[package]] +name = "pin-project" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.75", +] + [[package]] name = "pin-project-lite" version = "0.2.14" @@ -2521,9 +2634,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "piper" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae1d5c74c9876f070d3e8fd503d748c7d974c3e48da8f41350fa5222ef9b4391" +checksum = "96c8c490f422ef9a4efd2cb5b42b76c8613d7e7dfc1caf667b8a3350a5acc066" dependencies = [ "atomic-waker", "fastrand", @@ -2538,9 +2651,9 @@ checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" [[package]] name = "postgres" -version = "0.19.7" +version = "0.19.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7915b33ed60abc46040cbcaa25ffa1c7ec240668e0477c4f3070786f5916d451" +checksum = "6c9ec84ab55b0f9e418675de50052d494ba893fd28c65769a6e68fcdacbee2b8" dependencies = [ "bytes", "fallible-iterator", @@ -2559,16 +2672,16 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.75", ] [[package]] name = "postgres-protocol" -version = "0.6.6" +version = "0.6.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49b6c5ef183cd3ab4ba005f1ca64c21e8bd97ce4699cfea9e8d9a2c4958ca520" +checksum = "acda0ebdebc28befa84bee35e651e4c5f09073d668c7aed4cf7e23c3cda84b23" dependencies = [ - "base64", + "base64 0.22.1", "byteorder", "bytes", "fallible-iterator", @@ -2582,9 +2695,9 @@ dependencies = [ [[package]] name = "postgres-types" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d2234cdee9408b523530a9b6d2d6b373d1db34f6a8e51dc03ded1828d7fb67c" +checksum = "02048d9e032fb3cc3413bbf7b83a15d84a5d419778e2628751896d856498eee9" dependencies = [ "array-init", "bytes", @@ -2598,9 +2711,12 @@ dependencies = [ [[package]] name = "ppv-lite86" -version = "0.2.17" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] [[package]] name 
= "prettyplease" @@ -2609,7 +2725,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f12335488a2f3b0a83b14edad48dca9879ce89b2edd10e80237e4e852dd645e" dependencies = [ "proc-macro2", - "syn 2.0.66", + "syn 2.0.75", ] [[package]] @@ -2624,9 +2740,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.85" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22244ce15aa966053a896d1accb3a6e68469b97c7f33f284b99f0d576879fc23" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] @@ -2658,7 +2774,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.66", + "syn 2.0.75", "tempfile", ] @@ -2672,7 +2788,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.75", ] [[package]] @@ -2695,28 +2811,76 @@ dependencies = [ ] [[package]] -name = "protobuf-src" -version = "1.1.0+21.5" +name = "protobuf-src" +version = "2.1.0+27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7edafa3bcc668fa93efafcbdf58d7821bbda0f4b458ac7fae3d57ec0fec8167" +dependencies = [ + "cmake", +] + +[[package]] +name = "pure-rust-locales" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1190fd18ae6ce9e137184f207593877e70f39b015040156b1e05081cdfe3733a" + +[[package]] +name = "quick-xml" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96a05e2e8efddfa51a84ca47cec303fac86c8541b686d37cac5efc0e094417bc" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quinn" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b22d8e7369034b9a7132bc2008cac12f2013c8132b45e0554e6e20e2617f2156" +dependencies = [ + "bytes", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls", + "socket2", + "thiserror", + "tokio", + "tracing", +] + +[[package]] +name = "quinn-proto" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7ac8852baeb3cc6fb83b93646fb93c0ffe5d14bf138c945ceb4b9948ee0e3c1" +checksum = "ba92fb39ec7ad06ca2582c0ca834dfeadcaf06ddfc8e635c80aa7e1c05315fdd" dependencies = [ - "autotools", + "bytes", + "rand", + "ring", + "rustc-hash", + "rustls", + "slab", + "thiserror", + "tinyvec", + "tracing", ] [[package]] -name = "pure-rust-locales" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1190fd18ae6ce9e137184f207593877e70f39b015040156b1e05081cdfe3733a" - -[[package]] -name = "quick-xml" -version = "0.30.0" +name = "quinn-udp" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956" +checksum = "8bffec3605b73c6f1754535084a85229fa8a30f86014e6c81aeec4abb68b0285" dependencies = [ - "memchr", - "serde", + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.52.0", ] [[package]] @@ -2769,18 +2933,18 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.2" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd" +checksum = "2a908a6e00f1fdd0dfd9c0eb08ce85126f6d8bbda50017e74bc4a4b7d4a926a4" dependencies = [ - "bitflags 2.5.0", + "bitflags 2.6.0", ] [[package]] name = "regex" -version = "1.10.5" +version = "1.10.6" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" +checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" dependencies = [ "aho-corasick", "memchr", @@ -2822,30 +2986,31 @@ checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "regress" -version = "0.7.1" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ed9969cad8051328011596bf549629f1b800cf1731e7964b1eef8dfc480d2c2" +checksum = "0eae2a1ebfecc58aff952ef8ccd364329abe627762f5bf09ff42eb9d98522479" dependencies = [ - "hashbrown 0.13.2", + "hashbrown 0.14.5", "memchr", ] [[package]] name = "reqwest" -version = "0.11.27" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd67538700a17451e7cba03ac727fb961abb7607553461627b97de0b89cf4a62" +checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" dependencies = [ - "base64", + "base64 0.22.1", "bytes", - "encoding_rs", "futures-core", "futures-util", "h2", "http", "http-body", + "http-body-util", "hyper", "hyper-rustls", + "hyper-util", "ipnet", "js-sys", "log", @@ -2853,13 +3018,15 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", + "quinn", "rustls", + "rustls-native-certs", "rustls-pemfile", + "rustls-pki-types", "serde", "serde_json", "serde_urlencoded", "sync_wrapper", - "system-configuration", "tokio", "tokio-rustls", "tokio-util", @@ -2869,25 +3036,9 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "webpki-roots", "winreg", ] -[[package]] -name = "ring" -version = "0.16.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" -dependencies = [ - "cc", - "libc", - "once_cell", - "spin 0.5.2", - "untrusted 0.7.1", - "web-sys", - "winapi", -] - [[package]] name = "ring" version = "0.17.8" @@ -2898,8 +3049,8 @@ dependencies = [ "cfg-if", "getrandom", "libc", - "spin 0.9.8", - "untrusted 0.9.0", + "spin", + "untrusted", "windows-sys 0.52.0", ] @@ -2909,6 +3060,12 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +[[package]] +name = "rustc-hash" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" + [[package]] name = "rustc_version" version = "0.4.0" @@ -2924,7 +3081,7 @@ version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ - "bitflags 2.5.0", + "bitflags 2.6.0", "errno", "libc", "linux-raw-sys", @@ -2933,33 +3090,56 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.12" +version = "0.23.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" +checksum = "c58f8c84392efc0a126acce10fa59ff7b3d2ac06ab451a33f2741989b806b044" dependencies = [ - "log", - "ring 0.17.8", + "once_cell", + "ring", + "rustls-pki-types", "rustls-webpki", - "sct", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-native-certs" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a88d6d420651b496bdd98684116959239430022a115c1240e6c3993be0b15fba" +dependencies = [ + "openssl-probe", + "rustls-pemfile", + "rustls-pki-types", + "schannel", + "security-framework", ] [[package]] name = "rustls-pemfile" -version = "1.0.4" +version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" +checksum = "196fe16b00e106300d3e45ecfcb764fa292a535d7326a29a5875c579c7417425" dependencies = [ - "base64", + "base64 0.22.1", + "rustls-pki-types", ] +[[package]] +name = "rustls-pki-types" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0" + [[package]] name = "rustls-webpki" -version = "0.101.7" +version = "0.102.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" +checksum = "8e6b52d4fda176fd835fdc55a835d4a89b8499cad995885a21149d5ad62f852e" dependencies = [ - "ring 0.17.8", - "untrusted 0.9.0", + "ring", + "rustls-pki-types", + "untrusted", ] [[package]] @@ -2983,6 +3163,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "schannel" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "schemars" version = "0.8.21" @@ -3004,7 +3193,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.66", + "syn 2.0.75", ] [[package]] @@ -3014,13 +3203,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] -name = "sct" -version = "0.7.1" +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags 2.6.0", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" +checksum = "75da29fe9b9b08fe9d6b22b5b4bcbc75d8db3aa31e639aa56bb62e9d46bfceaf" dependencies = [ - "ring 0.17.8", - "untrusted 0.9.0", + "core-foundation-sys", + "libc", ] [[package]] @@ -3028,6 +3230,9 @@ name = "semver" version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" +dependencies = [ + "serde", +] [[package]] name = "seq-macro" @@ -3037,22 +3242,22 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.203" +version = "1.0.208" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" +checksum = "cff085d2cb684faa248efb494c39b68e522822ac0de72ccf08109abde717cfb2" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.203" +version = "1.0.208" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" +checksum = 
"24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.75", ] [[package]] @@ -3063,30 +3268,31 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.75", ] [[package]] name = "serde_json" -version = "1.0.117" +version = "1.0.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "455182ea6142b14f93f4bc5320a2b31c1f266b66a4a5c858b013302a5d8cbfc3" +checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed" dependencies = [ "itoa", + "memchr", "ryu", "serde", ] [[package]] name = "serde_tokenstream" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8790a7c3fe883e443eaa2af6f705952bc5d6e8671a220b9335c8cae92c037e74" +checksum = "64060d864397305347a78851c51588fd283767e7e7589829e8121d65512340f1" dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.66", + "syn 2.0.75", ] [[package]] @@ -3107,7 +3313,7 @@ version = "0.9.34+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" dependencies = [ - "indexmap 2.2.6", + "indexmap 2.4.0", "itoa", "ryu", "serde", @@ -3134,6 +3340,12 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "signal-hook" version = "0.3.17" @@ -3146,12 +3358,12 @@ dependencies = [ [[package]] name = "signal-hook-mio" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af" +checksum = "34db1a06d485c9142248b7a054f034b349b212551f3dfd19c94d45a754a217cd" dependencies = [ "libc", - "mio", + "mio 0.8.11", "signal-hook", ] @@ -3223,12 +3435,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "spin" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" - [[package]] name = "spin" version = "0.9.8" @@ -3237,9 +3443,9 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] name = "sqlparser" -version = "0.39.0" +version = "0.49.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "743b4dc2cbde11890ccb254a8fc9d537fa41b36da00de2a1c5e9848c9bc42bd7" +checksum = "a4a404d0e14905361b918cb8afdb73605e25c1d5029312bd9785142dcb3aa49e" dependencies = [ "log", "sqlparser_derive", @@ -3247,21 +3453,15 @@ dependencies = [ [[package]] name = "sqlparser_derive" -version = "0.1.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55fe75cb4a364c7f7ae06c7dbbc8d84bddd85d6cdf9975963c3935bc1991761e" +checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.75", ] -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - [[package]] name = "static_assertions" version = "1.1.0" @@ -3293,19 +3493,13 @@ checksum = 
"063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" [[package]] name = "strum" -version = "0.25.0" +version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" dependencies = [ - "strum_macros 0.25.3", + "strum_macros 0.26.4", ] -[[package]] -name = "strum" -version = "0.26.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29" - [[package]] name = "strum_macros" version = "0.24.3" @@ -3319,19 +3513,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "strum_macros" -version = "0.25.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" -dependencies = [ - "heck 0.4.1", - "proc-macro2", - "quote", - "rustversion", - "syn 2.0.66", -] - [[package]] name = "strum_macros" version = "0.26.4" @@ -3342,17 +3523,19 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.66", + "syn 2.0.75", ] [[package]] name = "substrait" -version = "0.19.0" +version = "0.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7299fc531294d189834eeaf7928482f311c0ada2cf0007948989cf75d0228183" +checksum = "b1ee6e584c8bf37104b7eb51c25eae07a9321b0e01379bec3b7c462d2f42afbf" dependencies = [ - "git2", - "heck 0.4.1", + "heck 0.5.0", + "pbjson", + "pbjson-build", + "pbjson-types", "prettyplease", "prost", "prost-build", @@ -3363,16 +3546,16 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.66", + "syn 2.0.75", "typify", "walkdir", ] [[package]] name = "subtle" -version = "2.5.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" @@ -3387,9 +3570,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.66" +version = "2.0.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c42f3f41a2de00b01c0aaad383c5a45241efc8b2d1eda5661812fda5f3cdcff5" +checksum = "f6af063034fc1935ede7be0122941bafa9bacb949334d090b77ca98b5817c7d9" dependencies = [ "proc-macro2", "quote", @@ -3398,52 +3581,21 @@ dependencies = [ [[package]] name = "sync_wrapper" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" - -[[package]] -name = "synstructure" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.66", -] - -[[package]] -name = "system-configuration" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" -dependencies = [ - "bitflags 1.3.2", - "core-foundation", - "system-configuration-sys", -] - -[[package]] -name = "system-configuration-sys" -version = "0.5.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" -dependencies = [ - "core-foundation-sys", - "libc", -] 
+checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" [[package]] name = "tempfile" -version = "3.10.1" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" dependencies = [ "cfg-if", "fastrand", + "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -3474,7 +3626,7 @@ checksum = "5999e24eaa32083191ba4e425deb75cdf25efefabe5aaccb7446dd0d4122a3f5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.75", ] [[package]] @@ -3485,22 +3637,22 @@ checksum = "23d434d3f8967a09480fb04132ebe0a3e088c173e6d0ee7897abbdf4eab0f8b9" [[package]] name = "thiserror" -version = "1.0.61" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" +checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.61" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" +checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.75", ] [[package]] @@ -3533,21 +3685,11 @@ dependencies = [ "crunchy", ] -[[package]] -name = "tinystr" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" -dependencies = [ - "displaydoc", - "zerovec", -] - [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" dependencies = [ "tinyvec_macros", ] @@ -3560,39 +3702,38 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.38.0" +version = "1.39.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" +checksum = "9babc99b9923bfa4804bd74722ff02c0381021eafa4db9949217e3be8e84fff5" dependencies = [ "backtrace", "bytes", "libc", - "mio", - "num_cpus", + "mio 1.0.2", "parking_lot", "pin-project-lite", "signal-hook-registry", "socket2", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "tokio-macros" -version = "2.3.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.75", ] [[package]] name = "tokio-postgres" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d340244b32d920260ae7448cb72b6e238bddc3d4f7603394e7dd46ed8e48f5b8" +checksum = "03adcf0147e203b6032c0b2d30be1415ba03bc348901f3ff1cc0df6a733e60c3" dependencies = [ "async-trait", "byteorder", @@ -3616,11 +3757,12 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.24.1" 
+version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" dependencies = [ "rustls", + "rustls-pki-types", "tokio", ] @@ -3660,9 +3802,9 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.6.6" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" [[package]] name = "toml_edit" @@ -3670,16 +3812,37 @@ version = "0.19.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421" dependencies = [ - "indexmap 2.2.6", + "indexmap 2.4.0", "toml_datetime", "winnow", ] +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "pin-project", + "pin-project-lite", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + [[package]] name = "tower-service" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" @@ -3700,7 +3863,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.75", ] [[package]] @@ -3766,9 +3929,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "typify" -version = "0.0.14" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2e3b707a653e2915a2fc2c4ee96a3d30b9554b9435eb4cc8b5c6c74bbdd3044" +checksum = "adb6beec125971dda80a086f90b4a70f60f222990ce4d63ad0fc140492f53444" dependencies = [ "typify-impl", "typify-macro", @@ -3776,35 +3939,38 @@ dependencies = [ [[package]] name = "typify-impl" -version = "0.0.14" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d9c752192779f666e4c868672dee56a652b82c08032c7e9d23f6a845b282298" +checksum = "93bbb24e990654aff858d80fee8114f4322f7d7a1b1ecb45129e2fcb0d0ad5ae" dependencies = [ - "heck 0.4.1", + "heck 0.5.0", "log", "proc-macro2", "quote", "regress", "schemars", + "semver", + "serde", "serde_json", - "syn 2.0.66", + "syn 2.0.75", "thiserror", "unicode-ident", ] [[package]] name = "typify-macro" -version = "0.0.14" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a14defd554507e72a2bb93cd081c8b374cfed43b3d986b141ad3839d9fd6986b" +checksum = "f8e6491896e955692d68361c68db2b263e3bec317ec0b684e0e2fa882fb6e31e" dependencies = [ "proc-macro2", "quote", "schemars", + "semver", "serde", "serde_json", "serde_tokenstream", - "syn 2.0.66", + "syn 2.0.75", "typify-impl", ] @@ -3853,12 +4019,6 @@ version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" -[[package]] -name = "untrusted" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" - [[package]] name = "untrusted" version = "0.9.0" @@ -3867,27 +4027,15 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.1" +version = "2.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7c25da092f0a868cdf09e8674cd3b7ef3a7d92a24253e663a2fb85e2496de56" +checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" dependencies = [ "form_urlencoded", "idna", "percent-encoding", ] -[[package]] -name = "utf16_iter" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" - -[[package]] -name = "utf8_iter" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" - [[package]] name = "utf8parse" version = "0.2.2" @@ -3896,9 +4044,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.8.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a183cf7feeba97b4dd1c0d46788634f6221d87fa961b305bed08c851829efcc0" +checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" dependencies = [ "getrandom", "rand", @@ -3907,13 +4055,13 @@ dependencies = [ [[package]] name = "uuid-macro-internal" -version = "1.8.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9881bea7cbe687e36c9ab3b778c36cd0487402e270304e8b1296d5085303c1a2" +checksum = "ee1cd046f83ea2c4e920d6ee9f7c3537ef928d75dce5d84a87c2c5d6b3999a3a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.75", ] [[package]] @@ -3922,17 +4070,11 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - [[package]] name = "version_check" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "walkdir" @@ -3967,34 +4109,35 @@ checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" [[package]] name = "wasm-bindgen" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5" dependencies = [ "cfg-if", + "once_cell", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b" 
dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.75", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.42" +version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" +checksum = "61e9300f63a621e96ed275155c108eb6f843b6a26d053f122ab69724559dc8ed" dependencies = [ "cfg-if", "js-sys", @@ -4004,9 +4147,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -4014,22 +4157,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", + "syn 2.0.75", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" +checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" [[package]] name = "wasm-streams" @@ -4046,20 +4189,14 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.69" +version = "0.3.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" +checksum = "26fdeaafd9bd129f65e7c031593c24d62186301e0c72c8978fa1678be7d532c0" dependencies = [ "js-sys", "wasm-bindgen", ] -[[package]] -name = "webpki-roots" -version = "0.25.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1" - [[package]] name = "whoami" version = "1.5.1" @@ -4089,11 +4226,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -4108,7 +4245,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.52.5", + "windows-targets 0.52.6", ] [[package]] @@ -4126,7 +4263,16 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.5", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 
0.52.6", ] [[package]] @@ -4146,18 +4292,18 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm 0.52.5", - "windows_aarch64_msvc 0.52.5", - "windows_i686_gnu 0.52.5", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", "windows_i686_gnullvm", - "windows_i686_msvc 0.52.5", - "windows_x86_64_gnu 0.52.5", - "windows_x86_64_gnullvm 0.52.5", - "windows_x86_64_msvc 0.52.5", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] @@ -4168,9 +4314,9 @@ checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" @@ -4180,9 +4326,9 @@ checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" @@ -4192,15 +4338,15 @@ checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] name = "windows_i686_gnullvm" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" @@ -4210,9 +4356,9 @@ checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" @@ -4222,9 +4368,9 @@ checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" @@ -4234,9 +4380,9 @@ checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = 
"0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" @@ -4246,9 +4392,9 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.52.5" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" @@ -4261,26 +4407,14 @@ dependencies = [ [[package]] name = "winreg" -version = "0.50.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" +checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" dependencies = [ "cfg-if", "windows-sys 0.48.0", ] -[[package]] -name = "write16" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" - -[[package]] -name = "writeable" -version = "0.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" - [[package]] name = "xz2" version = "0.1.7" @@ -4290,116 +4424,56 @@ dependencies = [ "lzma-sys", ] -[[package]] -name = "yoke" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5" -dependencies = [ - "serde", - "stable_deref_trait", - "yoke-derive", - "zerofrom", -] - -[[package]] -name = "yoke-derive" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.66", - "synstructure", -] - [[package]] name = "zerocopy" -version = "0.7.34" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae87e3fcd617500e5d106f0380cf7b77f3c6092aae37191433159dda23cfb087" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ + "byteorder", "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15e934569e47891f7d9411f1a451d947a60e000ab3bd24fbb970f000387d1b3b" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.66", -] - -[[package]] -name = "zerofrom" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55" -dependencies = [ - "zerofrom-derive", -] - -[[package]] -name = "zerofrom-derive" -version = "0.1.4" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.66", - "synstructure", -] - -[[package]] -name = "zerovec" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"bb2cc8827d6c0994478a15c53f374f46fbd41bea663d809b14744bc42e6b109c" -dependencies = [ - "yoke", - "zerofrom", - "zerovec-derive", + "syn 2.0.75", ] [[package]] -name = "zerovec-derive" -version = "0.10.2" +name = "zeroize" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97cf56601ee5052b4417d90c8755c6683473c926039908196cf35d99f893ebe7" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.66", -] +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" [[package]] name = "zstd" -version = "0.13.1" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d789b1514203a1120ad2429eae43a7bd32b90976a7bb8a05f7ec02fa88cc23a" +checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "7.1.0" +version = "7.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd99b45c6bc03a018c8b8a86025678c87e55526064e38f9df301989dce7ec0a" +checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" dependencies = [ "zstd-sys", ] [[package]] name = "zstd-sys" -version = "2.0.10+zstd.1.5.6" +version = "2.0.12+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c253a4914af5bafc8fa8c86ee400827e83cf6ec01195ec1f1ed8441bf00d65aa" +checksum = "0a4e40c320c3cb459d9a9ff6de98cff88f4751ee9275d140e2be94a2b74e4c13" dependencies = [ "cc", "pkg-config", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 96d6e6378..abdd797f6 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -19,17 +19,28 @@ opt-level = "z" lto = true [workspace.dependencies] -datafusion = { git = "https://github.com/lakesoul-io/arrow-datafusion.git", branch = "datafusion-33-parquet-prefetch" } -datafusion-common = { git = "https://github.com/lakesoul-io/arrow-datafusion.git", branch = "datafusion-33-parquet-prefetch" } -datafusion-substrait = { git = "https://github.com/lakesoul-io/arrow-datafusion.git", branch = "datafusion-33-parquet-prefetch" } -arrow = { git = "https://github.com/lakesoul-io/arrow-rs.git", branch = "arrow-rs-48-parquet-bufferred" } -arrow-schema = { git = "https://github.com/lakesoul-io/arrow-rs.git", branch = "arrow-rs-48-parquet-bufferred" } -arrow-array = { git = "https://github.com/lakesoul-io/arrow-rs.git", branch = "arrow-rs-48-parquet-bufferred" } -arrow-buffer = { git = "https://github.com/lakesoul-io/arrow-rs.git", branch = "arrow-rs-48-parquet-bufferred" } -arrow-cast = { git = "https://github.com/lakesoul-io/arrow-rs.git", branch = "arrow-rs-48-parquet-bufferred" } -arrow-arith = { git = "https://github.com/lakesoul-io/arrow-rs.git", branch = "arrow-rs-48-parquet-bufferred" } -parquet = { git = "https://github.com/lakesoul-io/arrow-rs.git", branch = "arrow-rs-48-parquet-bufferred" } -object_store = { git = "https://github.com/lakesoul-io/arrow-rs.git", branch = "object_store_0.7_opt", features = ["aws", "http"] } +#datafusion = { git = "https://github.com/lakesoul-io/arrow-datafusion.git", branch = "datafusion-33-parquet-prefetch" } +#datafusion-common = { git = "https://github.com/lakesoul-io/arrow-datafusion.git", branch = "datafusion-33-parquet-prefetch" } +#datafusion-substrait = { git = "https://github.com/lakesoul-io/arrow-datafusion.git", branch = "datafusion-33-parquet-prefetch" } +#arrow = { git = "https://github.com/lakesoul-io/arrow-rs.git", branch = "arrow-rs-48-parquet-bufferred" } +#arrow-schema = { git = 
"https://github.com/lakesoul-io/arrow-rs.git", branch = "arrow-rs-48-parquet-bufferred" } +#arrow-array = { git = "https://github.com/lakesoul-io/arrow-rs.git", branch = "arrow-rs-48-parquet-bufferred" } +#arrow-buffer = { git = "https://github.com/lakesoul-io/arrow-rs.git", branch = "arrow-rs-48-parquet-bufferred" } +#arrow-cast = { git = "https://github.com/lakesoul-io/arrow-rs.git", branch = "arrow-rs-48-parquet-bufferred" } +#arrow-arith = { git = "https://github.com/lakesoul-io/arrow-rs.git", branch = "arrow-rs-48-parquet-bufferred" } +#parquet = { git = "https://github.com/lakesoul-io/arrow-rs.git", branch = "arrow-rs-48-parquet-bufferred" } +#object_store = { git = "https://github.com/lakesoul-io/arrow-rs.git", branch = "object_store_0.7_opt", features = ["aws", "http"] } +datafusion = "41.0.0" +datafusion-common = "41.0.0" +datafusion-substrait = "41.0.0" +arrow = "52.2.0" +arrow-schema = "52.2.0" +arrow-array = "52.2.0" +arrow-buffer = "52.2.0" +arrow-cast = "52.2.0" +arrow-arith = "52.2.0" +parquet = "52.2.0" +object_store = { version = "0.10.2", features = ["aws", "http"] } tokio-stream = "0.1.9" tokio = { version = "1", features = ["full"] } diff --git a/rust/lakesoul-datafusion/src/catalog/lakesoul_catalog.rs b/rust/lakesoul-datafusion/src/catalog/lakesoul_catalog.rs index 2752e551c..6cd5b1766 100644 --- a/rust/lakesoul-datafusion/src/catalog/lakesoul_catalog.rs +++ b/rust/lakesoul-datafusion/src/catalog/lakesoul_catalog.rs @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 use crate::catalog::LakeSoulNamespace; -use datafusion::catalog::schema::SchemaProvider; use datafusion::catalog::CatalogProvider; use datafusion::error::{DataFusionError, Result}; use datafusion::prelude::SessionContext; @@ -13,6 +12,7 @@ use proto::proto::entity::Namespace; use std::any::Any; use std::fmt::{Debug, Formatter}; use std::sync::{Arc, RwLock}; +use datafusion::catalog_common::SchemaProvider; use tokio::runtime::Handle; /// A metadata wrapper diff --git a/rust/lakesoul-datafusion/src/catalog/lakesoul_namespace.rs b/rust/lakesoul-datafusion/src/catalog/lakesoul_namespace.rs index 1ea606896..cf941039e 100644 --- a/rust/lakesoul-datafusion/src/catalog/lakesoul_namespace.rs +++ b/rust/lakesoul-datafusion/src/catalog/lakesoul_namespace.rs @@ -4,14 +4,15 @@ use crate::catalog::create_io_config_builder; use async_trait::async_trait; -use datafusion::catalog::schema::SchemaProvider; +use datafusion::catalog_common::SchemaProvider; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::TableProvider; use datafusion::error::DataFusionError; +use datafusion::error::DataFusionError::External; use datafusion::error::Result; use datafusion::prelude::SessionContext; use lakesoul_io::datasource::file_format::LakeSoulParquetFormat; -use lakesoul_io::datasource::listing::LakeSoulListingTable; +use lakesoul_io::datasource::table::LakeSoulTableProvider; use lakesoul_metadata::error::LakeSoulMetaDataError; use lakesoul_metadata::MetaDataClientRef; use std::any::Any; @@ -98,46 +99,36 @@ impl SchemaProvider for LakeSoulNamespace { /// Search table by name /// return LakeSoulListing table - async fn table(&self, name: &str) -> Option> { + async fn table(&self, name: &str) -> Result>, DataFusionError> { let _guard = self.namespace_lock.read().await; - if self + + match self .metadata_client .get_table_info_by_table_name(name, &self.namespace) .await - .is_ok() { - debug!("call table() on table: {}.{}", &self.namespace, name); - let config; - if let Ok(config_builder) = - 
create_io_config_builder(self.metadata_client.clone(), Some(name), true, self.namespace()).await - { - config = config_builder.build(); - } else { - return None; - } - // Maybe should change - let file_format = Arc::new(LakeSoulParquetFormat::new( - Arc::new(ParquetFormat::new()), - config.clone(), - )); - if let Ok(table_provider) = LakeSoulListingTable::new_with_config_and_format( - &self.context.state(), - config, - file_format, - // care this - false, - ) - .await - { - debug!("get table provider success"); - return Some(Arc::new(table_provider)); - } - debug!("get table provider fail"); - return None; - } else { - debug!("get table provider fail"); - None + Ok(_) => {} + Err(_) => return Ok(None), } + debug!("call table() on table: {}.{}", &self.namespace, name); + let config = create_io_config_builder(self.metadata_client.clone(), Some(name), true, self.namespace()) + .await + .map_err(|e| External(Box::new(e)))? + .build(); + // Maybe should change + let file_format = Arc::new(LakeSoulParquetFormat::new( + Arc::new(ParquetFormat::new()), + config.clone(), + )); + let table_provider = LakeSoulTableProvider::new_with_config_and_format( + &self.context.state(), + config, + file_format, + // care this + false, + ) + .await?; + Ok(Some(Arc::new(table_provider))) } /// If supported by the implementation, adds a new table to this schema. @@ -176,7 +167,7 @@ impl SchemaProvider for LakeSoulNamespace { Arc::new(ParquetFormat::new()), config.clone(), )); - if let Ok(table_provider) = LakeSoulListingTable::new_with_config_and_format( + if let Ok(table_provider) = LakeSoulTableProvider::new_with_config_and_format( &cxt.state(), config, file_format, diff --git a/rust/lakesoul-datafusion/src/catalog/mod.rs b/rust/lakesoul-datafusion/src/catalog/mod.rs index 2a04a0c93..2eea3628a 100644 --- a/rust/lakesoul-datafusion/src/catalog/mod.rs +++ b/rust/lakesoul-datafusion/src/catalog/mod.rs @@ -2,20 +2,19 @@ // // SPDX-License-Identifier: Apache-2.0 -use datafusion::catalog::TableReference; +use datafusion::catalog_common::TableReference; +use lakesoul_io::lakesoul_io_config::{LakeSoulIOConfig, LakeSoulIOConfigBuilder}; +use lakesoul_metadata::MetaDataClientRef; +use proto::proto::entity::{CommitOp, DataCommitInfo, DataFileOp, FileOp, TableInfo, Uuid}; use std::env; use std::fmt::Debug; use std::sync::Arc; use std::time::SystemTime; -use lakesoul_io::lakesoul_io_config::{LakeSoulIOConfig, LakeSoulIOConfigBuilder}; -use lakesoul_metadata::MetaDataClientRef; -use proto::proto::entity::{CommitOp, DataCommitInfo, DataFileOp, FileOp, TableInfo, Uuid}; - -use crate::lakesoul_table::helpers::create_io_config_builder_from_table_info; -use crate::serialize::arrow_java::ArrowJavaSchema; // use crate::transaction::TransactionMetaInfo; use crate::error::{LakeSoulError, Result}; +use crate::lakesoul_table::helpers::create_io_config_builder_from_table_info; +use crate::serialize::arrow_java::ArrowJavaSchema; // pub mod lakesoul_sink; // pub mod lakesoul_source; @@ -85,6 +84,7 @@ pub(crate) async fn create_io_config_builder( } else { vec![] }; + println!("get table {} files: {:?}", table_name, data_files); create_io_config_builder_from_table_info(Arc::new(table_info)).map(|builder| builder.with_files(data_files)) } else { Ok(LakeSoulIOConfigBuilder::new()) } diff --git a/rust/lakesoul-datafusion/src/datasource/file_format/metadata_format.rs b/rust/lakesoul-datafusion/src/datasource/file_format/metadata_format.rs index 2f11fb3ca..99c7930c6 100644 --- a/rust/lakesoul-datafusion/src/datasource/file_format/metadata_format.rs
+++ b/rust/lakesoul-datafusion/src/datasource/file_format/metadata_format.rs @@ -12,15 +12,19 @@ use arrow::record_batch::RecordBatch; use async_trait::async_trait; use arrow::datatypes::{DataType, Field, Schema, SchemaBuilder, SchemaRef}; -use datafusion::common::{project_schema, FileType, Statistics}; -use datafusion::datasource::physical_plan::ParquetExec; +use datafusion::common::{project_schema, GetExt, Statistics}; +use datafusion::datasource::file_format::file_compression_type::FileCompressionType; +use datafusion::datasource::file_format::parquet::ParquetFormatFactory; +use datafusion::datasource::physical_plan::parquet::ParquetExecBuilder; use datafusion::error::DataFusionError; use datafusion::execution::TaskContext; -use datafusion::physical_expr::PhysicalSortExpr; +use datafusion::physical_expr::EquivalenceProperties; use datafusion::physical_plan::projection::ProjectionExec; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::union::UnionExec; -use datafusion::physical_plan::{DisplayAs, DisplayFormatType, Distribution, Partitioning, SendableRecordBatchStream}; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, Distribution, Partitioning, PlanProperties, SendableRecordBatchStream, +}; use datafusion::sql::TableReference; use datafusion::{ datasource::{ @@ -96,6 +100,14 @@ impl FileFormat for LakeSoulMetaDataParquetFormat { self } + fn get_ext(&self) -> String { + ParquetFormatFactory::new().get_ext() + } + + fn get_ext_with_compression(&self, _file_compression_type: &FileCompressionType) -> Result<String> { + Ok(self.get_ext()) + } + async fn infer_schema( &self, state: &SessionState, @@ -126,11 +138,7 @@ impl FileFormat for LakeSoulMetaDataParquetFormat { // If enable pruning then combine the filters to build the predicate. // If disable pruning then set the predicate to None, thus readers // will not prune data based on the statistics.
- let predicate = self - .parquet_format - .enable_pruning(state.config_options()) - .then(|| filters.cloned()) - .flatten(); + let predicate = self.parquet_format.enable_pruning().then(|| filters.cloned()).flatten(); let file_schema = conf.file_schema.clone(); let mut builder = SchemaBuilder::from(file_schema.fields()); @@ -165,15 +173,16 @@ impl FileFormat for LakeSoulMetaDataParquetFormat { HashMap::new(); let mut column_nullable = HashSet::<String>::new(); - for config in &flatten_conf { - let (partition_desc, partition_columnar_value) = partition_desc_from_file_scan_config(config)?; + for config in flatten_conf { + let (partition_desc, partition_columnar_value) = partition_desc_from_file_scan_config(&config)?; let partition_columnar_value = Arc::new(partition_columnar_value); - let parquet_exec = Arc::new(ParquetExec::new( - config.clone(), - predicate.clone(), - self.parquet_format.metadata_size_hint(state.config_options()), - )); + let mut builder = ParquetExecBuilder::new(config); + if let Some(predicate) = &predicate { + builder = builder.with_predicate(predicate.clone()); + } + + let parquet_exec = Arc::new(builder.build()); for field in parquet_exec.schema().fields().iter() { if field.is_nullable() { column_nullable.insert(field.name().clone()); @@ -252,10 +261,6 @@ impl FileFormat for LakeSoulMetaDataParquetFormat { as _, ) } - - fn file_type(&self) -> FileType { - FileType::PARQUET - } } // /// Execution plan for writing record batches to a [`LakeSoulParquetSink`] @@ -276,6 +281,8 @@ pub struct LakeSoulHashSinkExec { metadata_client: MetaDataClientRef, range_partitions: Arc<Vec<String>>, + + cache: PlanProperties, } impl Debug for LakeSoulHashSinkExec { @@ -295,13 +302,19 @@ impl LakeSoulHashSinkExec { let (range_partitions, _) = parse_table_info_partitions(table_info.partitions.clone()) .map_err(|_| DataFusionError::External("parse table_info.partitions failed".into()))?; let range_partitions = Arc::new(range_partitions); + let schema = make_sink_schema(); Ok(Self { - input, - sink_schema: make_sink_schema(), + input: input.clone(), + sink_schema: schema.clone(), sort_order, table_info, metadata_client, range_partitions, + cache: PlanProperties::new( + EquivalenceProperties::new(schema), + Partitioning::UnknownPartitioning(1), + input.properties().execution_mode, + ), }) } @@ -434,6 +447,10 @@ impl DisplayAs for LakeSoulHashSinkExec { } impl ExecutionPlan for LakeSoulHashSinkExec { + fn name(&self) -> &str { + "LakeSoulHashSinkExec" + } + /// Return a reference to Any that can be used for downcasting fn as_any(&self) -> &dyn Any { self } @@ -444,16 +461,8 @@ impl ExecutionPlan for LakeSoulHashSinkExec { self.sink_schema.clone() } - fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(1) - } - - fn unbounded_output(&self, _children: &[bool]) -> Result<bool> { - Ok(_children[0]) - } - - fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { - None + fn properties(&self) -> &PlanProperties { + &self.cache } fn required_input_distribution(&self) -> Vec<Distribution> { @@ -474,6 +483,7 @@ impl ExecutionPlan for LakeSoulHashSinkExec { Some(requirements) => vec![Some(requirements.clone())], None => vec![self .input + .properties() .output_ordering() .map(PhysicalSortRequirement::from_sort_exprs)], } @@ -489,8 +499,8 @@ impl ExecutionPlan for LakeSoulHashSinkExec { vec![false] } - fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> { - vec![self.input.clone()] + fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> { + vec![&self.input] } fn with_new_children(self: Arc<Self>, children: Vec<Arc<dyn ExecutionPlan>>) -> Result<Arc<dyn ExecutionPlan>> { @@ -501,6 +511,7 @@ impl ExecutionPlan
for LakeSoulHashSinkExec { table_info: self.table_info.clone(), range_partitions: self.range_partitions.clone(), metadata_client: self.metadata_client.clone(), + cache: self.cache.clone(), })) } @@ -512,7 +523,7 @@ impl ExecutionPlan for LakeSoulHashSinkExec { "FileSinkExec can only be called on partition 0!".to_string(), )); } - let num_input_partitions = self.input.output_partitioning().partition_count(); + let num_input_partitions = self.input.properties().output_partitioning().partition_count(); // launch one async task per *input* partition let mut join_handles = vec![]; diff --git a/rust/lakesoul-datafusion/src/datasource/table_provider.rs b/rust/lakesoul-datafusion/src/datasource/table_provider.rs index 6c5308a5a..aa71ca867 100644 --- a/rust/lakesoul-datafusion/src/datasource/table_provider.rs +++ b/rust/lakesoul-datafusion/src/datasource/table_provider.rs @@ -9,21 +9,22 @@ use std::sync::Arc; use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Schema, SchemaRef}; use async_trait::async_trait; -use datafusion::{execution::context::SessionState, logical_expr::Expr}; -use datafusion::common::{FileTypeWriterOptions, project_schema, Statistics, ToDFSchema}; -use datafusion::datasource::file_format::FileFormat; +use datafusion::catalog::Session; +use datafusion::common::{project_schema, Statistics, ToDFSchema}; use datafusion::datasource::file_format::parquet::ParquetFormat; +use datafusion::datasource::file_format::FileFormat; use datafusion::datasource::listing::{ListingOptions, ListingTableUrl, PartitionedFile}; use datafusion::datasource::physical_plan::{FileScanConfig, FileSinkConfig}; use datafusion::datasource::TableProvider; use datafusion::error::{DataFusionError, Result}; -use datafusion::logical_expr::{TableProviderFilterPushDown, TableType}; use datafusion::logical_expr::expr::Sort; -use datafusion::optimizer::utils::conjunction; +use datafusion::logical_expr::utils::conjunction; +use datafusion::logical_expr::{TableProviderFilterPushDown, TableType}; use datafusion::physical_expr::{create_physical_expr, LexOrdering, PhysicalSortExpr}; use datafusion::physical_plan::empty::EmptyExec; use datafusion::physical_plan::ExecutionPlan; use datafusion::scalar::ScalarValue; +use datafusion::{execution::context::SessionState, logical_expr::Expr}; use futures::stream::FuturesUnordered; use futures::StreamExt; @@ -89,7 +90,9 @@ impl LakeSoulTableProvider { let file_format: Arc<dyn FileFormat> = Arc::new( LakeSoulMetaDataParquetFormat::new( client.clone(), - Arc::new(ParquetFormat::new()), + Arc::new( + ParquetFormat::new().with_enable_pruning(session_state.config_options().execution.parquet.pruning), + ), table_info.clone(), lakesoul_io_config.clone(), ) .await @@ -140,11 +143,9 @@ impl LakeSoulTableProvider { fn is_partition_filter(&self, f: &Expr) -> bool { // O(nm), n = number of expr fields, m = number of range partitions - if let Ok(cols) = f.to_columns() { - cols.iter().all(|col| self.range_partitions.contains(&col.name)) - } else { - false - } + f.column_refs() .iter() .all(|col| self.range_partitions.contains(&col.name)) } pub fn options(&self) -> &ListingOptions { @@ -203,7 +204,7 @@ impl LakeSoulTableProvider { async fn list_files_for_scan<'a>( &'a self, - ctx: &'a SessionState, + ctx: &'a dyn Session, filters: &'a [Expr], _limit: Option<usize>, ) -> Result<(Vec<Vec<PartitionedFile>>, Statistics)> { @@ -223,17 +224,22 @@ impl LakeSoulTableProvider { ) })?; - let prune_partition_info = prune_partitions(all_partition_info, filters, self.table_partition_cols()) - .await - .map_err(|_| { -
DataFusionError::External( - format!( - "get all partition_info of table {} failed", - &self.table_info().table_name - ) - .into( ) - })?; + let prune_partition_info = prune_partitions( + self.table_info.as_ref(), + all_partition_info, + filters, + self.table_partition_cols(), + ) + .await + .map_err(|_| { + DataFusionError::External( + format!( + "get all partition_info of table {} failed", + &self.table_info().table_name ) - })?; + .into(), + ) + })?; let mut futures = FuturesUnordered::new(); for partition in prune_partition_info { @@ -259,6 +265,7 @@ impl LakeSoulTableProvider { object_meta, partition_values: partition_values.clone(), range: None, + statistics: None, extensions: None, }) .collect::<Vec<_>>(); @@ -285,7 +292,7 @@ impl TableProvider for LakeSoulTableProvider { async fn scan( &self, - state: &SessionState, + state: &dyn Session, projection: Option<&Vec<usize>>, filters: &[Expr], limit: Option<usize>, @@ -296,7 +303,7 @@ impl TableProvider for LakeSoulTableProvider { if partitioned_file_lists.is_empty() { let schema = self.schema(); let projected_schema = project_schema(&schema, projection)?; - return Ok(Arc::new(EmptyExec::new(false, projected_schema))); + return Ok(Arc::new(EmptyExec::new(projected_schema))); } // extract types of partition columns @@ -311,7 +318,7 @@ impl TableProvider for LakeSoulTableProvider { let filters = if let Some(expr) = conjunction(filters.to_vec()) { // NOTE: Use the table schema (NOT file schema) here because `expr` may contain references to partition columns. let table_df_schema = self.schema().as_ref().clone().to_dfschema()?; - let filters = create_physical_expr(&expr, &table_df_schema, &self.schema(), state.execution_props())?; + let filters = create_physical_expr(&expr, &table_df_schema, state.execution_props())?; Some(filters) } else { None }; @@ -320,14 +327,15 @@ impl TableProvider for LakeSoulTableProvider { let object_store_url = if let Some(url) = self.table_paths().first() { url.object_store() } else { - return Ok(Arc::new(EmptyExec::new(false, Arc::new(Schema::empty())))); + return Ok(Arc::new(EmptyExec::new(Arc::new(Schema::empty())))); }; + let session_state = state.as_any().downcast_ref::<SessionState>().unwrap(); // create the execution plan self.options() .format .create_physical_plan( - state, + session_state, FileScanConfig { object_store_url, file_schema: self.schema(), @@ -338,7 +346,6 @@ impl TableProvider for LakeSoulTableProvider { limit, output_ordering: self.try_create_output_ordering()?, table_partition_cols, - infinite_source: false, }, filters.as_ref(), ) @@ -360,7 +367,7 @@ impl TableProvider for LakeSoulTableProvider { async fn insert_into( &self, - state: &SessionState, + state: &dyn Session, input: Arc<dyn ExecutionPlan>, overwrite: bool, ) -> Result<Arc<dyn ExecutionPlan>> { @@ -368,13 +375,6 @@ impl TableProvider for LakeSoulTableProvider { // Get the object store for the table path.
let _store = state.runtime_env().object_store(table_path)?; - let file_format = self.options().format.as_ref(); - - let file_type_writer_options = match &self.options().file_type_write_options { - Some(opt) => opt.clone(), - None => FileTypeWriterOptions::build_default(&file_format.file_type(), state.config_options())?, - }; - // Sink related option, apart from format let config = FileSinkConfig { object_store_url: self.table_paths()[0].object_store(), @@ -382,17 +382,8 @@ file_groups: vec![], output_schema: self.schema(), table_partition_cols: self.options().table_partition_cols.clone(), - writer_mode: datafusion::datasource::file_format::write::FileWriterMode::PutMultipart, - // A plan can produce finite number of rows even if it has unbounded sources, like LIMIT - // queries. Thus, we can check if the plan is streaming to ensure file sink input is - // unbounded. When `unbounded_input` flag is `true` for sink, we occasionally call `yield_now` - // to consume data at the input. When `unbounded_input` flag is `false` (e.g. non-streaming data), - // all the data at the input is sink after execution finishes. See discussion for rationale: - // https://github.com/apache/arrow-datafusion/pull/7610#issuecomment-1728979918 - unbounded_input: false, - single_file_output: self.options().single_file, overwrite, - file_type_writer_options, + keep_partition_by_columns: false, }; let unsorted: Vec<Vec<Expr>> = vec![]; @@ -402,9 +393,10 @@ None }; + let session_state = state.as_any().downcast_ref::<SessionState>().unwrap(); self.options() .format - .create_writer_physical_plan(input, state, config, order_requirements) + .create_writer_physical_plan(input, session_state, config, order_requirements) .await } } diff --git a/rust/lakesoul-datafusion/src/lakesoul_table/helpers.rs b/rust/lakesoul-datafusion/src/lakesoul_table/helpers.rs index 5e8ccec1c..ef6b42621 100644 --- a/rust/lakesoul-datafusion/src/lakesoul_table/helpers.rs +++ b/rust/lakesoul-datafusion/src/lakesoul_table/helpers.rs @@ -14,10 +14,7 @@ use arrow_arith::boolean::and; use arrow_cast::cast; use datafusion::{ - common::{DFField, DFSchema}, - error::DataFusionError, - execution::context::ExecutionProps, - logical_expr::Expr, + common::DFSchema, error::DataFusionError, execution::context::ExecutionProps, logical_expr::Expr, physical_expr::create_physical_expr, }; use lakesoul_metadata::MetaDataClientRef; @@ -46,6 +43,7 @@ pub(crate) fn create_io_config_builder_from_table_info(table_info: Arc<TableInfo> pub async fn prune_partitions( + table_info: &TableInfo, all_partition_info: Vec<PartitionInfo>, filters: &[Expr], partition_cols: &[(String, DataType)], @@ -87,7 +85,12 @@ let df_schema = DFSchema::new_with_metadata( partition_cols .iter() - .map(|(n, d)| DFField::new_unqualified(n, d.clone(), true)) + .map(|(n, d)| { + ( + Some(format!("{}.{}", table_info.table_namespace, table_info.table_name).into()), + Arc::new(Field::new(n, d.clone(), false)), + ) + }) .collect(), Default::default(), )?; @@ -99,7 +102,7 @@ // Applies `filter` to `batch` returning `None` on error let do_filter = |filter| -> Option<ArrayRef> { - let expr = create_physical_expr(filter, &df_schema, &schema, &props).ok()?; + let expr = create_physical_expr(filter, &df_schema, &props).ok()?; expr.evaluate(&batch).ok()?.into_array(all_partition_info.len()).ok() }; diff --git a/rust/lakesoul-datafusion/src/planner/physical_planner.rs b/rust/lakesoul-datafusion/src/planner/physical_planner.rs index 0552a3ce4..fda6dedf2 100644 ---
a/rust/lakesoul-datafusion/src/planner/physical_planner.rs +++ b/rust/lakesoul-datafusion/src/planner/physical_planner.rs @@ -76,7 +76,6 @@ impl PhysicalPlanner for LakeSoulPhysicalPlanner { let physical_input = if !lakesoul_table.primary_keys().is_empty() || !lakesoul_table.range_partitions().is_empty() { - let input_schema = physical_input.schema(); let input_dfschema = input.as_ref().schema(); let sort_expr = column_names_to_physical_sort_expr( [ @@ -86,13 +85,11 @@ impl PhysicalPlanner for LakeSoulPhysicalPlanner { .concat() .as_slice(), input_dfschema, - &input_schema, session_state, )?; let hash_partitioning_expr = column_names_to_physical_expr( lakesoul_table.primary_keys(), input_dfschema, - &input_schema, session_state, )?; @@ -101,7 +98,6 @@ impl PhysicalPlanner for LakeSoulPhysicalPlanner { let range_partitioning_expr = column_names_to_physical_expr( lakesoul_table.range_partitions(), input_dfschema, - &input_schema, session_state, )?; let sort_exec = Arc::new(SortExec::new(sort_expr, physical_input)); @@ -116,7 +112,7 @@ impl PhysicalPlanner for LakeSoulPhysicalPlanner { provider.insert_into(session_state, physical_input, false).await } - Err(e) => return Err(DataFusionError::External(Box::new(e))), + Err(e) => Err(DataFusionError::External(Box::new(e))), } } LogicalPlan::Statement(statement) => { @@ -146,10 +142,9 @@ impl PhysicalPlanner for LakeSoulPhysicalPlanner { &self, expr: &Expr, input_dfschema: &DFSchema, - input_schema: &Schema, session_state: &SessionState, ) -> Result<Arc<dyn PhysicalExpr>> { self.default_planner - .create_physical_expr(expr, input_dfschema, input_schema, session_state) + .create_physical_expr(expr, input_dfschema, session_state) } } diff --git a/rust/lakesoul-datafusion/src/serialize/arrow_java.rs b/rust/lakesoul-datafusion/src/serialize/arrow_java.rs index a6e47b3b6..9b019b152 100644 --- a/rust/lakesoul-datafusion/src/serialize/arrow_java.rs +++ b/rust/lakesoul-datafusion/src/serialize/arrow_java.rs @@ -274,6 +274,7 @@ impl From<&FieldRef> for ArrowJavaField { DataType::Duration(_) => todo!("Duration type not supported"), DataType::Interval(_) => todo!("Interval type not supported"), DataType::RunEndEncoded(_, _) => todo!("RunEndEncoded type not supported"), + _ => panic!("{:?} type not supported", field.data_type()), }; let nullable = field.is_nullable(); ArrowJavaField { diff --git a/rust/lakesoul-datafusion/src/test/catalog_tests.rs b/rust/lakesoul-datafusion/src/test/catalog_tests.rs index 2b4517c72..ccf9578c1 100644 --- a/rust/lakesoul-datafusion/src/test/catalog_tests.rs +++ b/rust/lakesoul-datafusion/src/test/catalog_tests.rs @@ -10,8 +10,7 @@ mod catalog_tests { use arrow::array::{ArrayRef, Int32Array, RecordBatch}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::assert_batches_eq; - use datafusion::catalog::schema::SchemaProvider; - use datafusion::catalog::CatalogProvider; + use datafusion::catalog::{CatalogProvider, SchemaProvider}; use lakesoul_io::lakesoul_io_config::create_session_context; use lakesoul_io::lakesoul_io_config::LakeSoulIOConfigBuilder; use lakesoul_metadata::{MetaDataClient, MetaDataClientRef}; @@ -178,7 +177,7 @@ mod catalog_tests { assert_eq!(names.len(), tables.len()); for name in names { assert!(schema.table_exist(&name)); - assert!(schema.table(&name).await.is_some()); + assert!(schema.table(&name).await.unwrap().is_some()); assert!(schema.deregister_table(&name).unwrap().is_some()); } } @@ -284,9 +283,9 @@ mod catalog_tests { }); } - #[test] - fn test_all_cases() { - test_catalog_api(); - test_catalog_sql(); - } + // #[test] + // fn test_all_cases() 
{ + // test_catalog_api(); + // test_catalog_sql(); + // } } diff --git a/rust/lakesoul-datafusion/src/test/hash_tests.rs b/rust/lakesoul-datafusion/src/test/hash_tests.rs index c9fb4c540..cd7d4ccdf 100644 --- a/rust/lakesoul-datafusion/src/test/hash_tests.rs +++ b/rust/lakesoul-datafusion/src/test/hash_tests.rs @@ -3,13 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 mod hash_tests { + use chrono::DateTime; use lakesoul_io::hash_utils::{HashValue, HASH_SEED}; #[test] fn chrono_test() { let date = chrono::NaiveDate::parse_from_str("0001-01-01", "%Y-%m-%d").unwrap(); let datetime = date.and_hms_opt(12, 12, 12).unwrap(); - let epoch_time = chrono::NaiveDateTime::from_timestamp_millis(0).unwrap(); + let epoch_time = DateTime::from_timestamp_millis(0).unwrap().naive_utc(); println!("{}", datetime.signed_duration_since(epoch_time).num_days() as i32); println!( @@ -27,7 +28,7 @@ mod hash_tests { lakesoul_io::constant::FLINK_TIMESTAMP_FORMAT, ) .unwrap(); - let epoch_time = chrono::NaiveDateTime::from_timestamp_millis(0).unwrap(); + let epoch_time = DateTime::from_timestamp_millis(0).unwrap().naive_utc(); println!("{}", datetime.signed_duration_since(epoch_time).num_days() as i32); println!( diff --git a/rust/lakesoul-datafusion/src/test/upsert_tests.rs b/rust/lakesoul-datafusion/src/test/upsert_tests.rs index 5f91239bb..5888cf0a7 100644 --- a/rust/lakesoul-datafusion/src/test/upsert_tests.rs +++ b/rust/lakesoul-datafusion/src/test/upsert_tests.rs @@ -1462,10 +1462,10 @@ mod upsert_with_io_config_tests { .and_hms_micro_opt(8, 28, 53, 123456) .unwrap(); - let val1 = dt1.timestamp_micros(); - let val2 = dt2.timestamp_micros(); - let val3 = dt3.timestamp_micros(); - let val4 = dt4.timestamp_micros(); + let val1 = dt1.and_utc().timestamp_micros(); + let val2 = dt2.and_utc().timestamp_micros(); + let val3 = dt3.and_utc().timestamp_micros(); + let val4 = dt4.and_utc().timestamp_micros(); let table_name = "test_merge_same_column_with_timestamp_type_i64_time"; let builder = init_table( @@ -1519,10 +1519,10 @@ mod upsert_with_io_config_tests { .and_hms_micro_opt(8, 28, 53, 123456) .unwrap(); - let val1 = dt1.timestamp_micros(); - let _val2 = _dt2.timestamp_micros(); - let val3 = dt3.timestamp_micros(); - let val4 = dt4.timestamp_micros(); + let val1 = dt1.and_utc().timestamp_micros(); + let _val2 = _dt2.and_utc().timestamp_micros(); + let val3 = dt3.and_utc().timestamp_micros(); + let val4 = dt4.and_utc().timestamp_micros(); let table_name = "merge_different_columns_with_timestamp_type_i32_time"; let builder = init_table( @@ -3232,10 +3232,10 @@ mod upsert_with_metadata_tests { .and_hms_micro_opt(8, 28, 53, 123456) .unwrap(); - let val1 = dt1.timestamp_micros(); - let val2 = dt2.timestamp_micros(); - let val3 = dt3.timestamp_micros(); - let val4 = dt4.timestamp_micros(); + let val1 = dt1.and_utc().timestamp_micros(); + let val2 = dt2.and_utc().timestamp_micros(); + let val3 = dt3.and_utc().timestamp_micros(); + let val4 = dt4.and_utc().timestamp_micros(); let table_name = "test_merge_same_column_with_timestamp_type_i64_time"; let client = Arc::new(MetaDataClient::from_env().await?); @@ -3307,10 +3307,10 @@ mod upsert_with_metadata_tests { .and_hms_micro_opt(8, 28, 53, 123456) .unwrap(); - let val1 = dt1.timestamp_micros(); - let _val2 = _dt2.timestamp_micros(); - let val3 = dt3.timestamp_micros(); - let val4 = dt4.timestamp_micros(); + let val1 = dt1.and_utc().timestamp_micros(); + let _val2 = _dt2.and_utc().timestamp_micros(); + let val3 = dt3.and_utc().timestamp_micros(); + let val4 = 
dt4.and_utc().timestamp_micros(); let table_name = "merge_different_columns_with_timestamp_type_i32_time"; let client = Arc::new(MetaDataClient::from_env().await?); diff --git a/rust/lakesoul-io/Cargo.toml b/rust/lakesoul-io/Cargo.toml index 33e35ad90..89642eaf0 100644 --- a/rust/lakesoul-io/Cargo.toml +++ b/rust/lakesoul-io/Cargo.toml @@ -44,7 +44,6 @@ anyhow = { workspace = true, features = [] } prost = { workspace = true } env_logger = "0.11" - [features] hdfs = ["dep:hdrs"] default = [] @@ -55,9 +54,6 @@ datafusion-substrait = { workspace = true } [target.'cfg(not(target_os = "windows"))'.dependencies] datafusion-substrait = { workspace = true, features = ["protoc"] } - - - [dev-dependencies] tempfile = "3.3.0" comfy-table = "6.0" diff --git a/rust/lakesoul-io/src/datasource/empty_schema.rs b/rust/lakesoul-io/src/datasource/empty_schema.rs index 5c52c44d6..bbf8c880e 100644 --- a/rust/lakesoul-io/src/datasource/empty_schema.rs +++ b/rust/lakesoul-io/src/datasource/empty_schema.rs @@ -8,10 +8,10 @@ use std::sync::Arc; use arrow::datatypes::{Schema, SchemaRef}; use async_trait::async_trait; -use datafusion::execution::context::SessionState; use datafusion::logical_expr::Expr; use datafusion::physical_plan::ExecutionPlan; use datafusion::{datasource::TableProvider, logical_expr::TableType}; +use datafusion::catalog::Session; use datafusion_common::Result; use super::physical_plan::EmptySchemaScanExec; @@ -47,7 +47,7 @@ impl TableProvider for EmptySchemaProvider { async fn scan( &self, - _state: &SessionState, + _state: &dyn Session, _projections: Option<&Vec>, // filters and limit can be used here to inject some push-down operations if needed _filters: &[Expr], diff --git a/rust/lakesoul-io/src/datasource/file_format.rs b/rust/lakesoul-io/src/datasource/file_format.rs index db2cf41ef..7334e0671 100644 --- a/rust/lakesoul-io/src/datasource/file_format.rs +++ b/rust/lakesoul-io/src/datasource/file_format.rs @@ -16,14 +16,16 @@ use datafusion::execution::context::SessionState; use datafusion::physical_expr::PhysicalSortRequirement; use datafusion::physical_plan::projection::ProjectionExec; use datafusion::physical_plan::{ExecutionPlan, PhysicalExpr}; -use datafusion_common::{project_schema, FileType, Result, Statistics}; +use datafusion_common::{project_schema, DataFusionError, GetExt, Result, Statistics}; use object_store::{ObjectMeta, ObjectStore}; -use async_trait::async_trait; - -use crate::datasource::{listing::LakeSoulListingTable, physical_plan::MergeParquetExec}; +use crate::datasource::{table::LakeSoulTableProvider, physical_plan::MergeParquetExec}; use crate::lakesoul_io_config::LakeSoulIOConfig; +use async_trait::async_trait; +use datafusion::datasource::file_format::file_compression_type::FileCompressionType; +use datafusion::datasource::file_format::parquet::ParquetFormatFactory; +use datafusion_common::parsers::CompressionTypeVariant; /// LakeSoul `FileFormat` implementation for supporting Apache Parquet /// @@ -62,6 +64,23 @@ impl FileFormat for LakeSoulParquetFormat { self } + fn get_ext(&self) -> String { + ParquetFormatFactory::new().get_ext() + } + + fn get_ext_with_compression( + &self, + file_compression_type: &FileCompressionType, + ) -> Result { + let ext = self.get_ext(); + match file_compression_type.get_variant() { + CompressionTypeVariant::UNCOMPRESSED => Ok(ext), + _ => Err(DataFusionError::Internal( + "Parquet FileFormat does not support compression.".into(), + )), + } + } + async fn infer_schema( &self, state: &SessionState, @@ -94,11 +113,11 @@ impl FileFormat 
for LakeSoulParquetFormat { // will not prune data based on the statistics. let predicate = self .parquet_format - .enable_pruning(state.config_options()) + .enable_pruning() .then(|| filters.cloned()) .flatten(); - let table_schema = LakeSoulListingTable::compute_table_schema(conf.file_schema.clone(), &self.conf)?; + let table_schema = LakeSoulTableProvider::compute_table_schema(conf.file_schema.clone(), &self.conf)?; // projection for Table instead of File let projection = conf.projection.clone(); let target_schema = project_schema(&table_schema, projection.as_ref())?; @@ -126,7 +145,7 @@ impl FileFormat for LakeSoulParquetFormat { merged_schema.clone(), flatten_conf, predicate, - self.parquet_format.metadata_size_hint(state.config_options()), + self.parquet_format.metadata_size_hint(), self.conf.clone(), )?); @@ -155,10 +174,6 @@ impl FileFormat for LakeSoulParquetFormat { .create_writer_physical_plan(input, state, conf, order_requirements) .await } - - fn file_type(&self) -> FileType { - FileType::PARQUET - } } pub async fn flatten_file_scan_config( @@ -196,7 +211,6 @@ pub async fn flatten_file_scan_config( let limit = conf.limit; let table_partition_cols = conf.table_partition_cols.clone(); let output_ordering = conf.output_ordering.clone(); - let infinite_source = conf.infinite_source; let config = FileScanConfig { object_store_url: object_store_url.clone(), file_schema, @@ -206,7 +220,6 @@ pub async fn flatten_file_scan_config( limit, table_partition_cols, output_ordering, - infinite_source, }; flatten_configs.push(config); } diff --git a/rust/lakesoul-io/src/datasource/listing.rs b/rust/lakesoul-io/src/datasource/listing.rs deleted file mode 100644 index 3366868d8..000000000 --- a/rust/lakesoul-io/src/datasource/listing.rs +++ /dev/null @@ -1,156 +0,0 @@ -// SPDX-FileCopyrightText: 2023 LakeSoul Contributors -// -// SPDX-License-Identifier: Apache-2.0 - -use std::any::Any; -use std::fmt::{Debug, Formatter}; -use std::sync::Arc; - -use arrow_schema::SchemaBuilder; -use async_trait::async_trait; - -use arrow::datatypes::{Schema, SchemaRef}; - -use datafusion::datasource::file_format::FileFormat; -use datafusion::datasource::listing::{ListingOptions, ListingTable, ListingTableUrl}; -use datafusion::execution::context::SessionState; -use datafusion::physical_plan::ExecutionPlan; -use datafusion::{datasource::TableProvider, logical_expr::Expr}; - -use datafusion::logical_expr::{TableProviderFilterPushDown, TableType}; -use datafusion_common::{DataFusionError, Result}; -use tracing::{debug, instrument}; - -use crate::helpers::listing_table_from_lakesoul_io_config; -use crate::lakesoul_io_config::LakeSoulIOConfig; -use crate::transform::uniform_schema; - -pub struct LakeSoulListingTable { - listing_table: Arc, - lakesoul_io_config: LakeSoulIOConfig, - table_schema: SchemaRef, -} - -impl Debug for LakeSoulListingTable { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_struct("LakeSoulListingTable{..}").finish() - } -} - -impl LakeSoulListingTable { - pub async fn new_with_config_and_format( - session_state: &SessionState, - lakesoul_io_config: LakeSoulIOConfig, - file_format: Arc, - as_sink: bool, - ) -> Result { - let (file_schema, listing_table) = - listing_table_from_lakesoul_io_config(session_state, lakesoul_io_config.clone(), file_format, as_sink) - .await?; - let file_schema = file_schema.ok_or_else(|| DataFusionError::Internal("No schema provided.".into()))?; - let table_schema = Self::compute_table_schema(file_schema, &lakesoul_io_config)?; - - Ok(Self { - 
listing_table, - lakesoul_io_config, - table_schema, - }) - } - - pub fn options(&self) -> &ListingOptions { - self.listing_table.options() - } - - pub fn table_paths(&self) -> &Vec { - self.listing_table.table_paths() - } - - pub fn compute_table_schema(file_schema: SchemaRef, config: &LakeSoulIOConfig) -> Result { - let target_schema = if config.inferring_schema { - SchemaRef::new(Schema::empty()) - } else { - uniform_schema(config.target_schema()) - }; - let mut builder = SchemaBuilder::from(target_schema.fields()); - // O(n^2), n is the number of fields in file_schema and config.partition_schema - for field in file_schema.fields() { - if !target_schema.field_with_name(field.name()).is_ok() { - builder.try_merge(field)?; - } - } - for field in config.partition_schema().fields() { - if !target_schema.field_with_name(field.name()).is_ok() { - builder.try_merge(field)?; - } - } - Ok(Arc::new(builder.finish())) - } -} - -#[async_trait] -impl TableProvider for LakeSoulListingTable { - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - self.table_schema.clone() - } - - fn table_type(&self) -> TableType { - TableType::Base - } - - #[instrument] - async fn scan( - &self, - state: &SessionState, - projection: Option<&Vec>, - // filters and limit can be used here to inject some push-down operations if needed - filters: &[Expr], - limit: Option, - ) -> Result> { - debug!("listing scan start"); - self.listing_table.scan(state, projection, filters, limit).await - } - - fn supports_filters_pushdown(&self, filters: &[&Expr]) -> Result> { - if self.lakesoul_io_config.primary_keys.is_empty() { - if self.lakesoul_io_config.parquet_filter_pushdown { - Ok(vec![TableProviderFilterPushDown::Exact; filters.len()]) - } else { - Ok(vec![TableProviderFilterPushDown::Unsupported; filters.len()]) - } - } else { - // O(nml), n = number of filters, m = number of primary keys, l = number of columns - filters - .iter() - .map(|f| { - if let Ok(cols) = f.to_columns() { - if self.lakesoul_io_config.parquet_filter_pushdown - && cols - .iter() - .all(|col| self.lakesoul_io_config.primary_keys.contains(&col.name)) - { - // use primary key - Ok(TableProviderFilterPushDown::Inexact) - } else { - Ok(TableProviderFilterPushDown::Unsupported) - } - } else { - Ok(TableProviderFilterPushDown::Unsupported) - } - }) - .collect() - } - } - - async fn insert_into( - &self, - state: &SessionState, - input: Arc, - overwrite: bool, - ) -> Result> { - self.listing_table.insert_into(state, input, overwrite).await - } -} diff --git a/rust/lakesoul-io/src/datasource/mod.rs b/rust/lakesoul-io/src/datasource/mod.rs index 206a8d376..ca6608396 100644 --- a/rust/lakesoul-io/src/datasource/mod.rs +++ b/rust/lakesoul-io/src/datasource/mod.rs @@ -4,5 +4,5 @@ pub mod empty_schema; pub mod file_format; -pub mod listing; +pub mod table; pub mod physical_plan; diff --git a/rust/lakesoul-io/src/datasource/physical_plan/defatul_column.rs b/rust/lakesoul-io/src/datasource/physical_plan/defatul_column.rs index 51c6d6042..f5322608e 100644 --- a/rust/lakesoul-io/src/datasource/physical_plan/defatul_column.rs +++ b/rust/lakesoul-io/src/datasource/physical_plan/defatul_column.rs @@ -6,9 +6,10 @@ use std::sync::Arc; use std::{any::Any, collections::HashMap}; use arrow_schema::SchemaRef; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::PlanProperties; use datafusion::{ execution::TaskContext, - physical_expr::PhysicalSortExpr, physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, 
SendableRecordBatchStream}, }; use datafusion_common::{DataFusionError, Result}; @@ -20,6 +21,7 @@ pub struct DefaultColumnExec { input: Arc, target_schema: SchemaRef, default_column_value: Arc>, + cache: PlanProperties, } impl DefaultColumnExec { @@ -28,10 +30,16 @@ impl DefaultColumnExec { target_schema: SchemaRef, default_column_value: Arc>, ) -> Result { + let execution_mode = input.properties().execution_mode; Ok(Self { input, - target_schema, + target_schema: target_schema.clone(), default_column_value, + cache: PlanProperties::new( + EquivalenceProperties::new(target_schema), + datafusion::physical_plan::Partitioning::UnknownPartitioning(1), + execution_mode, + ), }) } } @@ -43,6 +51,10 @@ impl DisplayAs for DefaultColumnExec { } impl ExecutionPlan for DefaultColumnExec { + fn name(&self) -> &str { + "DefaultColumnExec" + } + fn as_any(&self) -> &dyn Any { self } @@ -51,15 +63,11 @@ impl ExecutionPlan for DefaultColumnExec { self.target_schema.clone() } - fn output_partitioning(&self) -> datafusion::physical_plan::Partitioning { - datafusion::physical_plan::Partitioning::UnknownPartitioning(1) - } - - fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { - None + fn properties(&self) -> &PlanProperties { + &self.cache } - fn children(&self) -> Vec> { + fn children(&self) -> Vec<&Arc> { vec![] } @@ -74,8 +82,8 @@ impl ExecutionPlan for DefaultColumnExec { ))); } - let mut streams = Vec::with_capacity(self.input.output_partitioning().partition_count()); - for i in 0..self.input.output_partitioning().partition_count() { + let mut streams = Vec::with_capacity(self.cache.output_partitioning().partition_count()); + for i in 0..self.cache.output_partitioning().partition_count() { let stream = self.input.execute(i, context.clone())?; streams.push(stream); } diff --git a/rust/lakesoul-io/src/datasource/physical_plan/empty_schema.rs b/rust/lakesoul-io/src/datasource/physical_plan/empty_schema.rs index 18d42d3ad..6c40ac770 100644 --- a/rust/lakesoul-io/src/datasource/physical_plan/empty_schema.rs +++ b/rust/lakesoul-io/src/datasource/physical_plan/empty_schema.rs @@ -5,27 +5,34 @@ use std::any::Any; use std::sync::Arc; +use crate::default_column_stream::empty_schema_stream::EmptySchemaStream; use arrow_schema::{Schema, SchemaRef}; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::{ExecutionMode, PlanProperties}; use datafusion::{ execution::TaskContext, - physical_expr::PhysicalSortExpr, physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, SendableRecordBatchStream}, }; use datafusion_common::Result; -use crate::default_column_stream::empty_schema_stream::EmptySchemaStream; - #[derive(Debug)] pub struct EmptySchemaScanExec { count: usize, empty_schema: SchemaRef, + cache: PlanProperties, } impl EmptySchemaScanExec { pub fn new(count: usize) -> Self { + let empty_schema = SchemaRef::new(Schema::empty()); Self { count, - empty_schema: SchemaRef::new(Schema::empty()), + empty_schema: empty_schema.clone(), + cache: PlanProperties::new( + EquivalenceProperties::new(empty_schema), + datafusion::physical_plan::Partitioning::UnknownPartitioning(1), + ExecutionMode::Bounded, + ), } } } @@ -37,6 +44,10 @@ impl DisplayAs for EmptySchemaScanExec { } impl ExecutionPlan for EmptySchemaScanExec { + fn name(&self) -> &str { + "EmptySchemaScanExec" + } + fn as_any(&self) -> &dyn Any { self } @@ -45,15 +56,11 @@ impl ExecutionPlan for EmptySchemaScanExec { self.empty_schema.clone() } - fn output_partitioning(&self) -> datafusion::physical_plan::Partitioning { - 
datafusion::physical_plan::Partitioning::UnknownPartitioning(1) - } - - fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { - None + fn properties(&self) -> &PlanProperties { + &self.cache } - fn children(&self) -> Vec> { + fn children(&self) -> Vec<&Arc> { vec![] } diff --git a/rust/lakesoul-io/src/datasource/physical_plan/merge.rs b/rust/lakesoul-io/src/datasource/physical_plan/merge.rs index 7e7f53aed..66b4d903a 100644 --- a/rust/lakesoul-io/src/datasource/physical_plan/merge.rs +++ b/rust/lakesoul-io/src/datasource/physical_plan/merge.rs @@ -7,13 +7,16 @@ use std::{any::Any, collections::HashMap}; use arrow_schema::{Field, Schema, SchemaRef}; use datafusion::dataframe::DataFrame; +use datafusion::datasource::physical_plan::parquet::ParquetExecBuilder; use datafusion::logical_expr::Expr; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::{ExecutionMode, PlanProperties}; use datafusion::{ - datasource::physical_plan::{FileScanConfig, ParquetExec}, + datasource::physical_plan::FileScanConfig, execution::TaskContext, - physical_expr::PhysicalSortExpr, physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PhysicalExpr, SendableRecordBatchStream}, }; +use datafusion_common::config::TableParquetOptions; use datafusion_common::{DFSchemaRef, DataFusionError, Result}; use datafusion_substrait::substrait::proto::Plan; use log::debug; @@ -32,6 +35,7 @@ pub struct MergeParquetExec { default_column_value: Arc>, merge_operators: Arc>, inputs: Vec>, + cache: PlanProperties, } impl MergeParquetExec { @@ -46,8 +50,14 @@ impl MergeParquetExec { // source file parquet scan let mut inputs = Vec::>::new(); for config in flatten_configs { - let single_exec = Arc::new(ParquetExec::new(config, predicate.clone(), metadata_size_hint)); - inputs.push(single_exec); + let mut builder = ParquetExecBuilder::new_with_options(config, TableParquetOptions::default()); + if let Some(predicate) = &predicate { + builder = builder.with_predicate(predicate.clone()); + } + if let Some(metadata_size_hint) = metadata_size_hint { + builder = builder.with_metadata_size_hint(metadata_size_hint); + } + inputs.push(Arc::new(builder.build())); } // O(nml), n = number of schema fields, m = number of file schema fields, l = number of files let schema = SchemaRef::new(Schema::new( @@ -76,11 +86,12 @@ impl MergeParquetExec { let merge_operators: Arc> = Arc::new(io_config.merge_operators); Ok(Self { - schema, + schema: schema.clone(), inputs, primary_keys, default_column_value, merge_operators, + cache: Self::compute_properties(schema), }) } @@ -94,11 +105,12 @@ impl MergeParquetExec { let merge_operators = Arc::new(io_config.merge_operators); Ok(Self { - schema, + schema: schema.clone(), inputs, primary_keys, default_column_value, merge_operators, + cache: Self::compute_properties(schema), }) } @@ -113,6 +125,14 @@ impl MergeParquetExec { pub fn merge_operators(&self) -> Arc> { self.merge_operators.clone() } + + fn compute_properties(schema: SchemaRef) -> PlanProperties { + PlanProperties::new( + EquivalenceProperties::new(schema), + datafusion::physical_plan::Partitioning::UnknownPartitioning(1), + ExecutionMode::Bounded, + ) + } } impl DisplayAs for MergeParquetExec { @@ -122,6 +142,10 @@ impl DisplayAs for MergeParquetExec { } impl ExecutionPlan for MergeParquetExec { + fn name(&self) -> &str { + "MergeParquetExec" + } + fn as_any(&self) -> &dyn Any { self } @@ -130,16 +154,12 @@ impl ExecutionPlan for MergeParquetExec { self.schema.clone() } - fn output_partitioning(&self) -> 
datafusion::physical_plan::Partitioning { - datafusion::physical_plan::Partitioning::UnknownPartitioning(1) + fn properties(&self) -> &PlanProperties { + &self.cache } - fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { - None - } - - fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> { - self.inputs.clone() + fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> { + self.inputs.iter().collect() } fn with_new_children(self: Arc<Self>, inputs: Vec<Arc<dyn ExecutionPlan>>) -> Result<Arc<dyn ExecutionPlan>> { @@ -149,6 +169,7 @@ impl ExecutionPlan for MergeParquetExec { primary_keys: self.primary_keys(), default_column_value: self.default_column_value(), merge_operators: self.merge_operators(), + cache: Self::compute_properties(self.schema()), })) } @@ -162,7 +183,7 @@ impl ExecutionPlan for MergeParquetExec { let mut stream_init_futs = Vec::with_capacity(self.inputs.len()); for i in 0..self.inputs.len() { let input = &self.inputs[i]; - let input_partition_count = input.output_partitioning().partition_count(); + let input_partition_count = input.properties().output_partitioning().partition_count(); if input_partition_count != 1 { return Err(DataFusionError::Internal(format!( "Invalid input partition count {input_partition_count}. \ @@ -256,15 +277,13 @@ pub fn convert_filter(df: &DataFrame, filter_str: Vec<String>, filter_protos: Vec<Plan>) let arrow_schema = Arc::new(Schema::from(df.schema())); debug!("schema:{:?}", arrow_schema); let mut str_filters = vec![]; - let mut proto_filters = vec![]; for f in &filter_str { let filter = FilterParser::parse(f.clone(), arrow_schema.clone())?; str_filters.push(filter); } - for p in &filter_protos { - let e = FilterParser::parse_proto(p, df.schema())?; - proto_filters.push(e); - } + let proto_filters = filter_protos.into_iter().map(|plan| { + FilterParser::parse_substrait_plan(plan, df.schema()) + }).collect::<Result<Vec<_>>>()?; debug!("str filters: {:#?}", str_filters); debug!("proto filters: {:#?}", proto_filters); if proto_filters.is_empty() { diff --git a/rust/lakesoul-io/src/datasource/table.rs b/rust/lakesoul-io/src/datasource/table.rs new file mode 100644 index 000000000..b6c1c5285 --- /dev/null +++ b/rust/lakesoul-io/src/datasource/table.rs @@ -0,0 +1,200 @@ +// SPDX-FileCopyrightText: 2023 LakeSoul Contributors +// +// SPDX-License-Identifier: Apache-2.0 + +use std::any::Any; +use std::fmt::{Debug, Formatter}; +use std::sync::Arc; + +use arrow_schema::SchemaBuilder; +use async_trait::async_trait; + +use arrow::datatypes::{Schema, SchemaRef}; + +use crate::helpers::listing_table_from_lakesoul_io_config; +use crate::lakesoul_io_config::LakeSoulIOConfig; +use crate::transform::uniform_schema; +use datafusion::catalog::Session; +use datafusion::datasource::file_format::FileFormat; +use datafusion::datasource::listing::{ListingOptions, ListingTableUrl, PartitionedFile}; +use datafusion::datasource::physical_plan::FileScanConfig; +use datafusion::execution::context::SessionState; +use datafusion::logical_expr::utils::conjunction; +use datafusion::logical_expr::{TableProviderFilterPushDown, TableType}; +use datafusion::physical_plan::empty::EmptyExec; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::{datasource::TableProvider, logical_expr::Expr}; +use datafusion_common::DataFusionError::ObjectStore; +use datafusion_common::{DataFusionError, Result, Statistics, ToDFSchema}; +use futures::future; +use object_store::path::Path; +use tracing::debug; +use url::Url; + +pub struct LakeSoulTableProvider { + lakesoul_io_config: LakeSoulIOConfig, + table_schema: SchemaRef, + listing_options: ListingOptions, + listing_table_paths: Vec<ListingTableUrl>, +} + +impl 
Debug for LakeSoulTableProvider { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("LakeSoulTableProvider{..}").finish() + } +} + +impl LakeSoulTableProvider { + pub async fn new_with_config_and_format( + session_state: &SessionState, + lakesoul_io_config: LakeSoulIOConfig, + file_format: Arc<dyn FileFormat>, + as_sink: bool, + ) -> Result<Self> { + let (file_schema, listing_table) = listing_table_from_lakesoul_io_config( + session_state, + lakesoul_io_config.clone(), + file_format.clone(), + as_sink, + ) + .await?; + + let listing_options = listing_table.options().clone(); + let listing_table_paths = listing_table.table_paths().clone(); + let file_schema = file_schema.ok_or_else(|| DataFusionError::Internal("No schema provided.".into()))?; + let table_schema = Self::compute_table_schema(file_schema, &lakesoul_io_config)?; + + Ok(Self { + lakesoul_io_config, + table_schema, + listing_options, + listing_table_paths, + }) + } + + pub fn compute_table_schema(file_schema: SchemaRef, config: &LakeSoulIOConfig) -> Result<SchemaRef> { + let target_schema = if config.inferring_schema { + SchemaRef::new(Schema::empty()) + } else { + uniform_schema(config.target_schema()) + }; + let mut builder = SchemaBuilder::from(target_schema.fields()); + // O(n^2), n is the number of fields in file_schema and config.partition_schema + for field in file_schema.fields() { + if target_schema.field_with_name(field.name()).is_err() { + builder.try_merge(field)?; + } + } + for field in config.partition_schema().fields() { + if target_schema.field_with_name(field.name()).is_err() { + builder.try_merge(field)?; + } + } + Ok(Arc::new(builder.finish())) + } +} + +#[async_trait] +impl TableProvider for LakeSoulTableProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.table_schema.clone() + } + + fn table_type(&self) -> TableType { + TableType::Base + } + + async fn scan( + &self, + state: &dyn Session, + projection: Option<&Vec<usize>>, + // filters and limit can be used here to inject some push-down operations if needed + filters: &[Expr], + limit: Option<usize>, + ) -> Result<Arc<dyn ExecutionPlan>> { + debug!("listing scan start"); + let object_store_url = if let Some(url) = self.listing_table_paths.first() { + url.object_store() + } else { + return Ok(Arc::new(EmptyExec::new(Arc::new(Schema::empty())))); + }; + let statistics = Statistics::new_unknown(&self.schema()); + + let filters = if let Some(expr) = conjunction(filters.to_vec()) { + // NOTE: Use the table schema (NOT file schema) here because `expr` may contain references to partition columns. + let table_df_schema = self.table_schema.as_ref().clone().to_dfschema()?; + let filters = + datafusion::physical_expr::create_physical_expr(&expr, &table_df_schema, state.execution_props())?; + Some(filters) + } else { + None + }; + + let session_state = state.as_any().downcast_ref::<SessionState>().unwrap(); + let store = state.runtime_env().object_store(object_store_url.clone())?; + let partition_files: Result<Vec<PartitionedFile>> = + future::try_join_all(self.listing_table_paths.iter().map(|url| { + let store = store.clone(); + async move { + Ok(PartitionedFile::from( + store + .head(&Path::from_url_path( + <ListingTableUrl as AsRef<Url>>::as_ref(url).path(), + )?) 
+ .await + .map_err(ObjectStore)?, + )) + } + })) + .await; + self.listing_options + .format + .create_physical_plan( + session_state, + FileScanConfig { + object_store_url, + file_schema: Arc::clone(&self.schema()), + file_groups: vec![partition_files?], + statistics, + projection: projection.cloned(), + limit, + output_ordering: vec![], + table_partition_cols: vec![], + }, + filters.as_ref(), + ) + .await + } + + fn supports_filters_pushdown(&self, filters: &[&Expr]) -> Result> { + if self.lakesoul_io_config.primary_keys.is_empty() { + if self.lakesoul_io_config.parquet_filter_pushdown { + Ok(vec![TableProviderFilterPushDown::Exact; filters.len()]) + } else { + Ok(vec![TableProviderFilterPushDown::Unsupported; filters.len()]) + } + } else { + // O(nml), n = number of filters, m = number of primary keys, l = number of columns + filters + .iter() + .map(|f| { + let cols = f.column_refs(); + if self.lakesoul_io_config.parquet_filter_pushdown + && cols + .iter() + .all(|col| self.lakesoul_io_config.primary_keys.contains(&col.name)) + { + // use primary key + Ok(TableProviderFilterPushDown::Inexact) + } else { + Ok(TableProviderFilterPushDown::Unsupported) + } + }) + .collect() + } + } +} diff --git a/rust/lakesoul-io/src/default_column_stream/empty_schema_stream.rs b/rust/lakesoul-io/src/default_column_stream/empty_schema_stream.rs index a3033ecdd..11886a34f 100644 --- a/rust/lakesoul-io/src/default_column_stream/empty_schema_stream.rs +++ b/rust/lakesoul-io/src/default_column_stream/empty_schema_stream.rs @@ -52,7 +52,7 @@ impl Stream for EmptySchemaStream { vec![], &RecordBatchOptions::new().with_row_count(Some(row_count)), ); - Poll::Ready(Some(batch.map_err(ArrowError))) + Poll::Ready(Some(batch.map_err(|e| ArrowError(e, None)))) } else { Poll::Ready(None) } diff --git a/rust/lakesoul-io/src/filter/parser.rs b/rust/lakesoul-io/src/filter/parser.rs index 970673bab..4ccbb406e 100644 --- a/rust/lakesoul-io/src/filter/parser.rs +++ b/rust/lakesoul-io/src/filter/parser.rs @@ -2,34 +2,30 @@ // // SPDX-License-Identifier: Apache-2.0 -use std::collections::HashMap; use std::ops::Not; -use std::str::FromStr; use std::sync::Arc; use anyhow::anyhow; -use arrow_schema::{DataType, Field, Fields, SchemaRef, TimeUnit}; -use datafusion::logical_expr::{expr, BinaryExpr, BuiltinScalarFunction, Expr, Operator}; -use datafusion::prelude::col; +use arrow_schema::{DataType, Field, Fields, SchemaRef}; +use datafusion::functions::core::expr_ext::FieldAccessor; +use datafusion::logical_expr::Expr; +use datafusion::prelude::{col, SessionContext}; use datafusion::scalar::ScalarValue; -use datafusion_common::{not_impl_err, plan_err, Column, DFSchema, DataFusionError, Result}; -use datafusion_substrait::substrait; -use datafusion_substrait::substrait::proto::expression::field_reference::ReferenceType::DirectReference; +use datafusion_common::DataFusionError::{External, Internal}; +use datafusion_common::{Column, DFSchema, Result}; +use datafusion_substrait::extensions::Extensions; +use datafusion_substrait::logical_plan::consumer::from_substrait_rex; +use datafusion_substrait::substrait::proto::expression::field_reference::ReferenceType; use datafusion_substrait::substrait::proto::expression::literal::LiteralType; -use datafusion_substrait::substrait::proto::expression::reference_segment::ReferenceType; -use datafusion_substrait::substrait::proto::expression::{Literal, RexType}; -use datafusion_substrait::substrait::proto::extensions::simple_extension_declaration::MappingType; +use 
datafusion_substrait::substrait::proto::expression::reference_segment::StructField; +use datafusion_substrait::substrait::proto::expression::{reference_segment, Literal, RexType}; use datafusion_substrait::substrait::proto::function_argument::ArgType; +use datafusion_substrait::substrait::proto::plan_rel::RelType::Root; use datafusion_substrait::substrait::proto::r#type::Nullability; -use datafusion_substrait::substrait::proto::read_rel::ReadType; -use datafusion_substrait::substrait::proto::rel::RelType; -use datafusion_substrait::substrait::proto::{plan_rel, r#type, Expression, Plan, Rel, Type}; -use datafusion_substrait::variation_const::{ - DATE_32_TYPE_REF, DATE_64_TYPE_REF, DECIMAL_128_TYPE_REF, DECIMAL_256_TYPE_REF, DEFAULT_CONTAINER_TYPE_REF, - DEFAULT_TYPE_REF, LARGE_CONTAINER_TYPE_REF, TIMESTAMP_MICRO_TYPE_REF, TIMESTAMP_MILLI_TYPE_REF, - TIMESTAMP_NANO_TYPE_REF, TIMESTAMP_SECOND_TYPE_REF, UNSIGNED_INTEGER_TYPE_REF, -}; -use log::debug; +use datafusion_substrait::substrait::proto::rel::RelType::Read; +use datafusion_substrait::substrait::proto::{Expression, FunctionArgument, Plan, PlanRel, Rel, RelRoot}; +use tokio::runtime::{Builder, Handle}; +use tokio::task; pub struct Parser {} @@ -76,12 +72,10 @@ impl Parser { } fn parse_filter_str(filter: String) -> Result<(String, String, String)> { - let op_offset = filter - .find('(') - .ok_or(DataFusionError::External(anyhow!("wrong filter str").into()))?; + let op_offset = filter.find('(').ok_or(External(anyhow!("wrong filter str").into()))?; let (op, filter) = filter.split_at(op_offset); if !filter.ends_with(')') { - return Err(DataFusionError::External(anyhow!("wrong filter str").into())); + return Err(External(anyhow!("wrong filter str").into())); } let filter = &filter[1..filter.len() - 1]; let mut k: usize = 0; @@ -118,17 +112,13 @@ impl Parser { DataType::Decimal128(precision, scale) => { if precision <= 18 { Expr::Literal(ScalarValue::Decimal128( - Some( - value - .parse::() - .map_err(|e| DataFusionError::External(Box::new(e)))?, - ), + Some(value.parse::().map_err(|e| External(Box::new(e)))?), precision, scale, )) } else { let binary_vec = Parser::parse_binary_array(value.as_str())? 
- .ok_or(DataFusionError::External(anyhow!("parse binary array failed").into()))?; + .ok_or(External(anyhow!("parse binary array failed").into()))?; let mut arr = [0u8; 16]; for idx in 0..binary_vec.len() { arr[idx + 16 - binary_vec.len()] = binary_vec[idx]; @@ -141,52 +131,32 @@ impl Parser { } } DataType::Boolean => Expr::Literal(ScalarValue::Boolean(Some( - value - .parse::() - .map_err(|e| DataFusionError::External(Box::new(e)))?, + value.parse::().map_err(|e| External(Box::new(e)))?, ))), DataType::Binary => Expr::Literal(ScalarValue::Binary(Parser::parse_binary_array(value.as_str())?)), DataType::Float32 => Expr::Literal(ScalarValue::Float32(Some( - value - .parse::() - .map_err(|e| DataFusionError::External(Box::new(e)))?, + value.parse::().map_err(|e| External(Box::new(e)))?, ))), DataType::Float64 => Expr::Literal(ScalarValue::Float64(Some( - value - .parse::() - .map_err(|e| DataFusionError::External(Box::new(e)))?, + value.parse::().map_err(|e| External(Box::new(e)))?, ))), DataType::Int8 => Expr::Literal(ScalarValue::Int8(Some( - value - .parse::() - .map_err(|e| DataFusionError::External(Box::new(e)))?, + value.parse::().map_err(|e| External(Box::new(e)))?, ))), DataType::Int16 => Expr::Literal(ScalarValue::Int16(Some( - value - .parse::() - .map_err(|e| DataFusionError::External(Box::new(e)))?, + value.parse::().map_err(|e| External(Box::new(e)))?, ))), DataType::Int32 => Expr::Literal(ScalarValue::Int32(Some( - value - .parse::() - .map_err(|e| DataFusionError::External(Box::new(e)))?, + value.parse::().map_err(|e| External(Box::new(e)))?, ))), DataType::Int64 => Expr::Literal(ScalarValue::Int64(Some( - value - .parse::() - .map_err(|e| DataFusionError::External(Box::new(e)))?, + value.parse::().map_err(|e| External(Box::new(e)))?, ))), DataType::Date32 => Expr::Literal(ScalarValue::Date32(Some( - value - .parse::() - .map_err(|e| DataFusionError::External(Box::new(e)))?, + value.parse::().map_err(|e| External(Box::new(e)))?, ))), DataType::Timestamp(_, _) => Expr::Literal(ScalarValue::TimestampMicrosecond( - Some( - value - .parse::() - .map_err(|e| DataFusionError::External(Box::new(e)))?, - ), + Some(value.parse::().map_err(|e| External(Box::new(e)))?), Some(crate::constant::LAKESOUL_TIMEZONE.into()), )), DataType::Utf8 => { @@ -215,7 +185,7 @@ impl Parser { .iter() .map(|s| s.parse::()) .map(|s| { - let s = s.map_err(|e| DataFusionError::External(Box::new(e)))?; + let s = s.map_err(|e| External(Box::new(e)))?; if s < 0 { Ok((s + 256) as u8) } else { @@ -228,389 +198,79 @@ impl Parser { Ok(res) } - pub(crate) fn parse_proto(plan: &Plan, df_schema: &DFSchema) -> Result { - let function_extension = plan - .extensions - .iter() - .map(|e| match &e.mapping_type { - Some(ext) => match ext { - MappingType::ExtensionFunction(ext_f) => Ok((ext_f.function_anchor, &ext_f.name)), - _ => not_impl_err!("Extension type not supported: {ext:?}"), - }, - None => not_impl_err!("Cannot parse empty extension"), - }) - .collect::>>()?; - // Parse relations - match plan.relations.len() { - 1 => match plan.relations[0].rel_type.as_ref() { - Some(rt) => match rt { - plan_rel::RelType::Rel(rel) => Ok(Parser::parse_rel(rel, &function_extension, df_schema)?), - plan_rel::RelType::Root(root) => Ok(Parser::parse_rel( - root.input - .as_ref() - .ok_or(DataFusionError::Substrait("wrong root".to_string()))?, - &function_extension, - df_schema, - )?), - }, - None => plan_err!("Cannot parse plan relation: None"), - }, - _ => not_impl_err!( - "Substrait plan with more than 1 relation trees not supported. 
Number of relation trees: {:?}", - plan.relations.len() - ), - } - } - - fn parse_rel(rel: &Rel, extensions: &HashMap, df_schema: &DFSchema) -> Result { - match &rel.rel_type { - Some(RelType::Read(read)) => match &read.as_ref().read_type { - None => { - not_impl_err!("unsupported") - } - Some(ReadType::NamedTable(_nt)) => { - let e = read - .filter - .as_ref() - .ok_or(DataFusionError::Substrait("wrong filter".to_string()))?; - Parser::parse_rex(e.as_ref(), df_schema, extensions) - } - Some(_) => { - not_impl_err!("un supported") - } - }, - _ => not_impl_err!("un supported"), - } - } - - // recursion - fn parse_rex(e: &Expression, input_schema: &DFSchema, extensions: &HashMap) -> Result { - match &e.rex_type { - Some(RexType::Selection(field_ref)) => match &field_ref.reference_type { - Some(DirectReference(direct)) => match &direct.reference_type.as_ref() { - Some(ReferenceType::MapKey(x)) => match &x.child.as_ref() { - Some(_) => not_impl_err!("MapKey is not supported"), - None => { - let literal = x - .map_key - .as_ref() - .ok_or(DataFusionError::Substrait("can not get map key".into()))?; - let sv = from_substrait_literal(literal)?; - let field_name = match sv { - ScalarValue::Utf8(s) => { - s.ok_or(DataFusionError::Substrait("can not get map key".into())) + // caller may only pass MapKey for field reference, + // we need to change it to StructField since from_substrait_field_reference + // only supports it + fn modify_substrait_argument(arguments: &mut Vec, df_schema: &DFSchema) { + for arg in arguments { + match &mut arg.arg_type { + Some(ArgType::Value(Expression { + rex_type: Some(RexType::Selection(f)), + })) => { + if let Some(ReferenceType::DirectReference(reference_segment)) = &mut f.reference_type { + if let Some(reference_segment::ReferenceType::MapKey(map_key)) = + &mut reference_segment.reference_type + { + if let Some(Literal { + literal_type: Some(LiteralType::String(name)), + .. + }) = &map_key.map_key + { + if let Some(idx) = df_schema.index_of_column_by_name(None, name.as_ref()) { + reference_segment.reference_type = + Some(reference_segment::ReferenceType::StructField(Box::new(StructField { + field: idx as i32, + child: None, + }))); } - _ => not_impl_err!("map key wrong type"), - }?; - debug!("field name: {}", field_name); - let column = input_schema - .field_with_unqualified_name(&field_name)? 
- .qualified_column(); - Ok(Expr::Column(Column { - relation: column.relation, - name: column.name, - })) - } - }, - _ => not_impl_err!("Direct reference with types other than MapKey is not supported"), - }, - _ => not_impl_err!("unsupported field ref type"), - }, - Some(RexType::ScalarFunction(f)) => { - let fn_name = extensions.get(&f.function_reference).ok_or_else(|| { - DataFusionError::NotImplemented(format!( - "Aggregated function not found: function reference = {:?}", - f.function_reference - )) - })?; - let fn_type = scalar_function_type_from_str(fn_name)?; - match fn_type { - ScalarFunctionType::Builtin(fun) => { - let mut args = Vec::with_capacity(f.arguments.len()); - for arg in &f.arguments { - let arg_expr = match &arg.arg_type { - Some(ArgType::Value(e)) => Parser::parse_rex(e, input_schema, extensions), - _ => not_impl_err!("Aggregated function argument non-Value type not supported"), - }; - args.push(arg_expr?); - } - Ok(Expr::ScalarFunction(expr::ScalarFunction { fun, args })) - } - ScalarFunctionType::Op(op) => { - if f.arguments.len() != 2 { - return not_impl_err!("Expect two arguments for binary operator {op:?}"); - } - let lhs = &f.arguments[0].arg_type; - let rhs = &f.arguments[1].arg_type; - - match (lhs, rhs) { - (Some(ArgType::Value(l)), Some(ArgType::Value(r))) => Ok(Expr::BinaryExpr(BinaryExpr { - left: Box::new(Parser::parse_rex(l, input_schema, extensions)?), - op, - right: Box::new(Parser::parse_rex(r, input_schema, extensions)?), - })), - (l, r) => not_impl_err!("Invalid arguments for binary expression: {l:?} and {r:?}"), - } - } - ScalarFunctionType::Not => { - let arg = f.arguments.first().ok_or_else(|| { - DataFusionError::Substrait("expect one argument for `NOT` expr".to_string()) - })?; - match &arg.arg_type { - Some(ArgType::Value(e)) => { - let expr = Parser::parse_rex(e, input_schema, extensions)?; - Ok(Expr::Not(Box::new(expr))) - } - _ => not_impl_err!("Invalid arguments for Not expression"), - } - } - ScalarFunctionType::IsNull => { - let arg = f.arguments.first().ok_or_else(|| { - DataFusionError::Substrait("expect one argument for `IS NULL` expr".to_string()) - })?; - match &arg.arg_type { - Some(ArgType::Value(e)) => { - let expr = Parser::parse_rex(e, input_schema, extensions)?; - Ok(Expr::IsNull(Box::new(expr))) - } - _ => not_impl_err!("Invalid arguments for IS NULL expression"), - } - } - ScalarFunctionType::IsNotNull => { - let arg = f.arguments.first().ok_or_else(|| { - DataFusionError::Substrait("expect one argument for `IS NOT NULL` expr".to_string()) - })?; - match &arg.arg_type { - Some(ArgType::Value(e)) => { - let expr = Parser::parse_rex(e, input_schema, extensions)?; - Ok(Expr::IsNotNull(Box::new(expr))) - } - _ => { - not_impl_err!("Invalid arguments for IS NOT NULL expression") } } } - _ => not_impl_err!("not implemented"), } + Some(ArgType::Value(Expression { + rex_type: Some(RexType::ScalarFunction(f)), + })) => { + Self::modify_substrait_argument(&mut f.arguments, df_schema); + } + _ => {} } - Some(RexType::Literal(lit)) => { - let scalar_value = from_substrait_literal(lit)?; - Ok(Expr::Literal(scalar_value)) - } - _ => unimplemented!(), } } -} - -enum ScalarFunctionType { - Builtin(BuiltinScalarFunction), - Op(Operator), - /// [Expr::Not] - Not, - /// [Expr::Like] Used for filtering rows based on the given wildcard pattern. 
Case-sensitive - Like, - /// [Expr::Like] Case insensitive operator counterpart of `Like` - ILike, - /// [Expr::IsNull] - IsNull, - /// [Expr::IsNotNull] - IsNotNull, -} - -pub fn name_to_op(name: &str) -> Result { - match name { - "equal" => Ok(Operator::Eq), - "not_equal" => Ok(Operator::NotEq), - "lt" => Ok(Operator::Lt), - "lte" => Ok(Operator::LtEq), - "gt" => Ok(Operator::Gt), - "gte" => Ok(Operator::GtEq), - "add" => Ok(Operator::Plus), - "subtract" => Ok(Operator::Minus), - "multiply" => Ok(Operator::Multiply), - "divide" => Ok(Operator::Divide), - "mod" => Ok(Operator::Modulo), - "and" => Ok(Operator::And), - "or" => Ok(Operator::Or), - "is_distinct_from" => Ok(Operator::IsDistinctFrom), - "is_not_distinct_from" => Ok(Operator::IsNotDistinctFrom), - "regex_match" => Ok(Operator::RegexMatch), - "regex_imatch" => Ok(Operator::RegexIMatch), - "regex_not_match" => Ok(Operator::RegexNotMatch), - "regex_not_imatch" => Ok(Operator::RegexNotIMatch), - "bitwise_and" => Ok(Operator::BitwiseAnd), - "bitwise_or" => Ok(Operator::BitwiseOr), - "str_concat" => Ok(Operator::StringConcat), - "at_arrow" => Ok(Operator::AtArrow), - "arrow_at" => Ok(Operator::ArrowAt), - "bitwise_xor" => Ok(Operator::BitwiseXor), - "bitwise_shift_right" => Ok(Operator::BitwiseShiftRight), - "bitwise_shift_left" => Ok(Operator::BitwiseShiftLeft), - _ => not_impl_err!("Unsupported function name: {name:?}"), - } -} - -fn scalar_function_type_from_str(name: &str) -> Result { - let (name, _) = name - .split_once(':') - .ok_or(DataFusionError::Substrait("wrong func type".to_string()))?; - if let Ok(op) = datafusion_substrait::logical_plan::consumer::name_to_op(name) { - return Ok(ScalarFunctionType::Op(op)); - } - - if let Ok(fun) = BuiltinScalarFunction::from_str(name) { - return Ok(ScalarFunctionType::Builtin(fun)); - } - match name { - "not" => Ok(ScalarFunctionType::Not), - "like" => Ok(ScalarFunctionType::Like), - "ilike" => Ok(ScalarFunctionType::ILike), - "is_null" => Ok(ScalarFunctionType::IsNull), - "is_not_null" => Ok(ScalarFunctionType::IsNotNull), - others => not_impl_err!("Unsupported function name: {others:?}"), - } -} - -fn from_substrait_literal(lit: &Literal) -> Result { - let scalar_value = match &lit.literal_type { - Some(LiteralType::Boolean(b)) => ScalarValue::Boolean(Some(*b)), - Some(LiteralType::I8(n)) => match lit.type_variation_reference { - DEFAULT_TYPE_REF => ScalarValue::Int8(Some(*n as i8)), - UNSIGNED_INTEGER_TYPE_REF => ScalarValue::UInt8(Some(*n as u8)), - others => { - return Err(DataFusionError::Substrait(format!( - "Unknown type variation reference {others}", - ))); - } - }, - Some(LiteralType::I16(n)) => match lit.type_variation_reference { - DEFAULT_TYPE_REF => ScalarValue::Int16(Some(*n as i16)), - UNSIGNED_INTEGER_TYPE_REF => ScalarValue::UInt16(Some(*n as u16)), - others => { - return Err(DataFusionError::Substrait(format!( - "Unknown type variation reference {others}", - ))); - } - }, - Some(LiteralType::I32(n)) => match lit.type_variation_reference { - DEFAULT_TYPE_REF => ScalarValue::Int32(Some(*n)), - UNSIGNED_INTEGER_TYPE_REF => ScalarValue::UInt32(Some(*n as u32)), - others => { - return Err(DataFusionError::Substrait(format!( - "Unknown type variation reference {others}", - ))); - } - }, - Some(LiteralType::I64(n)) => match lit.type_variation_reference { - DEFAULT_TYPE_REF => ScalarValue::Int64(Some(*n)), - UNSIGNED_INTEGER_TYPE_REF => ScalarValue::UInt64(Some(*n as u64)), - others => { - return Err(DataFusionError::Substrait(format!( - "Unknown type variation reference 
{others}", - ))); - } - }, - Some(LiteralType::Fp32(f)) => ScalarValue::Float32(Some(*f)), - Some(LiteralType::Fp64(f)) => ScalarValue::Float64(Some(*f)), - Some(LiteralType::Timestamp(t)) => ScalarValue::TimestampMicrosecond(Some(*t), None), - Some(LiteralType::TimestampTz(t)) => ScalarValue::TimestampMicrosecond(Some(*t), None), - - Some(LiteralType::Date(d)) => ScalarValue::Date32(Some(*d)), - Some(LiteralType::String(s)) => match lit.type_variation_reference { - DEFAULT_CONTAINER_TYPE_REF => ScalarValue::Utf8(Some(s.clone())), - LARGE_CONTAINER_TYPE_REF => ScalarValue::LargeUtf8(Some(s.clone())), - others => { - return Err(DataFusionError::Substrait(format!( - "Unknown type variation reference {others}", - ))); + pub(crate) fn parse_substrait_plan(plan: Plan, df_schema: &DFSchema) -> Result { + let handle = Handle::try_current(); + let closure = async { + let ctx = SessionContext::default(); + if let Some(PlanRel { + rel_type: + Some(Root(RelRoot { + input: + Some(Rel { + rel_type: Some(Read(mut read_rel)), + }), + .. + })), + }) = plan.relations.get(0).cloned() + { + if let Some(ref mut expression) = &mut read_rel.filter { + let extensions = Extensions::try_from(&plan.extensions)?; + if let Some(RexType::ScalarFunction(f)) = &mut expression.rex_type { + Self::modify_substrait_argument(&mut f.arguments, df_schema); + } + return from_substrait_rex(&ctx, expression, df_schema, &extensions).await; + } } - }, - Some(LiteralType::Binary(b)) => match lit.type_variation_reference { - DEFAULT_CONTAINER_TYPE_REF => ScalarValue::Binary(Some(b.clone())), - LARGE_CONTAINER_TYPE_REF => ScalarValue::LargeBinary(Some(b.clone())), - others => { - return Err(DataFusionError::Substrait(format!( - "Unknown type variation reference {others}", - ))); + Err(Internal(format!("encountered wrong substrait plan {:?}", plan))) + }; + match handle { + Ok(handle) => task::block_in_place(move || handle.block_on(closure)), + _ => { + let runtime = Builder::new_current_thread() + .build() + .map_err(|e| External(Box::new(e)))?; + runtime.block_on(closure) } - }, - Some(LiteralType::FixedBinary(b)) => ScalarValue::FixedSizeBinary(b.len() as _, Some(b.clone())), - Some(LiteralType::Decimal(d)) => { - let value: [u8; 16] = d.value.clone().try_into().or(Err(DataFusionError::Substrait( - "Failed to parse decimal value".to_string(), - )))?; - let p = d - .precision - .try_into() - .map_err(|e| DataFusionError::Substrait(format!("Failed to parse decimal precision: {e}")))?; - let s = d - .scale - .try_into() - .map_err(|e| DataFusionError::Substrait(format!("Failed to parse decimal scale: {e}")))?; - ScalarValue::Decimal128(Some(std::primitive::i128::from_le_bytes(value)), p, s) - } - Some(LiteralType::Null(ntype)) => from_substrait_null(ntype)?, - _ => return not_impl_err!("Unsupported literal_type: {:?}", lit.literal_type), - }; - - Ok(scalar_value) -} - -fn from_substrait_null(null_type: &Type) -> Result { - if let Some(kind) = &null_type.kind { - match kind { - r#type::Kind::Bool(_) => Ok(ScalarValue::Boolean(None)), - r#type::Kind::I8(integer) => match integer.type_variation_reference { - DEFAULT_TYPE_REF => Ok(ScalarValue::Int8(None)), - UNSIGNED_INTEGER_TYPE_REF => Ok(ScalarValue::UInt8(None)), - v => not_impl_err!("Unsupported Substrait type variation {v} of type {kind:?}"), - }, - r#type::Kind::I16(integer) => match integer.type_variation_reference { - DEFAULT_TYPE_REF => Ok(ScalarValue::Int16(None)), - UNSIGNED_INTEGER_TYPE_REF => Ok(ScalarValue::UInt16(None)), - v => not_impl_err!("Unsupported Substrait type 
variation {v} of type {kind:?}"), - }, - r#type::Kind::I32(integer) => match integer.type_variation_reference { - DEFAULT_TYPE_REF => Ok(ScalarValue::Int32(None)), - UNSIGNED_INTEGER_TYPE_REF => Ok(ScalarValue::UInt32(None)), - v => not_impl_err!("Unsupported Substrait type variation {v} of type {kind:?}"), - }, - r#type::Kind::I64(integer) => match integer.type_variation_reference { - DEFAULT_TYPE_REF => Ok(ScalarValue::Int64(None)), - UNSIGNED_INTEGER_TYPE_REF => Ok(ScalarValue::UInt64(None)), - v => not_impl_err!("Unsupported Substrait type variation {v} of type {kind:?}"), - }, - r#type::Kind::Fp32(_) => Ok(ScalarValue::Float32(None)), - r#type::Kind::Fp64(_) => Ok(ScalarValue::Float64(None)), - r#type::Kind::Timestamp(ts) => match ts.type_variation_reference { - TIMESTAMP_SECOND_TYPE_REF => Ok(ScalarValue::TimestampSecond(None, None)), - TIMESTAMP_MILLI_TYPE_REF => Ok(ScalarValue::TimestampMillisecond(None, None)), - TIMESTAMP_MICRO_TYPE_REF => Ok(ScalarValue::TimestampMicrosecond(None, None)), - TIMESTAMP_NANO_TYPE_REF => Ok(ScalarValue::TimestampNanosecond(None, None)), - v => not_impl_err!("Unsupported Substrait type variation {v} of type {kind:?}"), - }, - r#type::Kind::Date(date) => match date.type_variation_reference { - DATE_32_TYPE_REF => Ok(ScalarValue::Date32(None)), - DATE_64_TYPE_REF => Ok(ScalarValue::Date64(None)), - v => not_impl_err!("Unsupported Substrait type variation {v} of type {kind:?}"), - }, - r#type::Kind::Binary(binary) => match binary.type_variation_reference { - DEFAULT_CONTAINER_TYPE_REF => Ok(ScalarValue::Binary(None)), - LARGE_CONTAINER_TYPE_REF => Ok(ScalarValue::LargeBinary(None)), - v => not_impl_err!("Unsupported Substrait type variation {v} of type {kind:?}"), - }, - // FixedBinary is not supported because `None` doesn't have length - r#type::Kind::String(string) => match string.type_variation_reference { - DEFAULT_CONTAINER_TYPE_REF => Ok(ScalarValue::Utf8(None)), - LARGE_CONTAINER_TYPE_REF => Ok(ScalarValue::LargeUtf8(None)), - v => not_impl_err!("Unsupported Substrait type variation {v} of type {kind:?}"), - }, - r#type::Kind::Decimal(d) => Ok(ScalarValue::Decimal128(None, d.precision as u8, d.scale as i8)), - _ => not_impl_err!("Unsupported Substrait type: {kind:?}"), } - } else { - not_impl_err!("Null type without kind is not supported") } } @@ -645,81 +305,6 @@ fn qualified_expr(expr_str: &str, schema: SchemaRef) -> Option<(Expr, Arc } } -fn _from_substrait_type(dt: &substrait::proto::Type) -> Result<(DataType, Nullability)> { - match &dt.kind { - Some(s_kind) => match s_kind { - r#type::Kind::Bool(b) => Ok((DataType::Boolean, b.nullability())), - r#type::Kind::I8(integer) => match integer.type_variation_reference { - DEFAULT_TYPE_REF => Ok((DataType::Int8, integer.nullability())), - UNSIGNED_INTEGER_TYPE_REF => Ok((DataType::UInt8, integer.nullability())), - v => not_impl_err!("Unsupported Substrait type variation {v} of type {s_kind:?}"), - }, - r#type::Kind::I16(integer) => match integer.type_variation_reference { - DEFAULT_TYPE_REF => Ok((DataType::Int16, integer.nullability())), - UNSIGNED_INTEGER_TYPE_REF => Ok((DataType::UInt16, integer.nullability())), - v => not_impl_err!("Unsupported Substrait type variation {v} of type {s_kind:?}"), - }, - r#type::Kind::I32(integer) => match integer.type_variation_reference { - DEFAULT_TYPE_REF => Ok((DataType::Int32, integer.nullability())), - UNSIGNED_INTEGER_TYPE_REF => Ok((DataType::UInt32, integer.nullability())), - v => not_impl_err!("Unsupported Substrait type variation {v} of type 
{s_kind:?}"), - }, - r#type::Kind::I64(integer) => match integer.type_variation_reference { - DEFAULT_TYPE_REF => Ok((DataType::Int64, integer.nullability())), - UNSIGNED_INTEGER_TYPE_REF => Ok((DataType::UInt64, integer.nullability())), - v => not_impl_err!("Unsupported Substrait type variation {v} of type {s_kind:?}"), - }, - r#type::Kind::Fp32(fp) => Ok((DataType::Float32, fp.nullability())), - r#type::Kind::Fp64(fp) => Ok((DataType::Float64, fp.nullability())), - r#type::Kind::Timestamp(ts) => Ok((DataType::Timestamp(TimeUnit::Microsecond, None), ts.nullability())), - r#type::Kind::TimestampTz(tz) => Ok(( - DataType::Timestamp(TimeUnit::Microsecond, Some(crate::constant::LAKESOUL_TIMEZONE.into())), - tz.nullability(), - )), - r#type::Kind::Date(date) => match date.type_variation_reference { - DATE_32_TYPE_REF => Ok((DataType::Date32, date.nullability())), - DATE_64_TYPE_REF => Ok((DataType::Date64, date.nullability())), - v => not_impl_err!("Unsupported Substrait type variation {v} of type {s_kind:?}"), - }, - r#type::Kind::Binary(binary) => match binary.type_variation_reference { - DEFAULT_CONTAINER_TYPE_REF => Ok((DataType::Binary, binary.nullability())), - LARGE_CONTAINER_TYPE_REF => Ok((DataType::LargeBinary, binary.nullability())), - v => not_impl_err!("Unsupported Substrait type variation {v} of type {s_kind:?}"), - }, - r#type::Kind::FixedBinary(fixed) => Ok((DataType::FixedSizeBinary(fixed.length), fixed.nullability())), - r#type::Kind::String(string) => match string.type_variation_reference { - DEFAULT_CONTAINER_TYPE_REF => Ok((DataType::Utf8, string.nullability())), - LARGE_CONTAINER_TYPE_REF => Ok((DataType::LargeUtf8, string.nullability())), - v => not_impl_err!("Unsupported Substrait type variation {v} of type {s_kind:?}"), - }, - r#type::Kind::List(_list) => { - not_impl_err!("Unsupported") - // let (inner_type, _nullability) = - // from_substrait_type(list.r#type.as_ref().ok_or_else(|| { - // DataFusionError::Substrait( - // "List type must have inner type".to_string(), - // ) - // })?)?; - // let field = Arc::new(Field::new("list_item", inner_type, true)); - // match list.type_variation_reference { - // DEFAULT_CONTAINER_TYPE_REF => Ok(DataType::List(field)), - // LARGE_CONTAINER_TYPE_REF => Ok(DataType::LargeList(field)), - // v => not_impl_err!( - // "Unsupported Substrait type variation {v} of type {s_kind:?}" - // )?, - // } - } - r#type::Kind::Decimal(d) => match d.type_variation_reference { - DECIMAL_128_TYPE_REF => Ok((DataType::Decimal128(d.precision as u8, d.scale as i8), d.nullability())), - DECIMAL_256_TYPE_REF => Ok((DataType::Decimal256(d.precision as u8, d.scale as i8), d.nullability())), - v => not_impl_err!("Unsupported Substrait type variation {v} of type {s_kind:?}"), - }, - _ => not_impl_err!("Unsupported Substrait type: {s_kind:?}"), - }, - _ => not_impl_err!("`None` Substrait kind is not supported"), - } -} - fn _from_nullability(nullability: Nullability) -> bool { match nullability { Nullability::Unspecified => true, @@ -765,7 +350,7 @@ mod tests { ]; let byte_array = unsafe { std::mem::transmute::<&[i8], &[u8]>(&byte_array[..]) }; let plan = Plan::decode(&byte_array[..]).unwrap(); - let e = Parser::parse_proto(&plan, df.schema()).unwrap(); + let e = Parser::parse_substrait_plan(plan, df.schema()).unwrap(); let df = df.filter(e).unwrap(); let df = df.explain(true, true).unwrap(); df.show().await.unwrap() diff --git a/rust/lakesoul-io/src/hash_utils/mod.rs b/rust/lakesoul-io/src/hash_utils/mod.rs index 5dce6018c..d35babe71 100644 --- 
a/rust/lakesoul-io/src/hash_utils/mod.rs +++ b/rust/lakesoul-io/src/hash_utils/mod.rs @@ -122,6 +122,18 @@ impl HashValue for [u8] { } } +impl HashValue for IntervalDayTime { + fn hash_one(&self, _seed: u32) -> u32 { + panic!("IntervalDayTimeType is not supported yet."); + } +} + +impl HashValue for IntervalMonthDayNano { + fn hash_one(&self, _seed: u32) -> u32 { + panic!("IntervalMonthDayNanoType is not supported yet."); + } +} + /// Builds hash values of PrimitiveArray and writes them into `hashes_buffer` /// If `rehash==true` this combines the previous hash value in the buffer /// with the new hash using `combine_hashes` @@ -132,7 +144,7 @@ fn hash_array_primitive( rehash: bool, ) where T: ArrowPrimitiveType, - ::Native: HashValue, + ::Native: HashValue, { assert_eq!( hashes_buffer.len(), @@ -436,7 +448,6 @@ mod tests { #[test] // Tests actual values of hashes, which are different if forcing collisions - #[cfg(not(feature = "force_hash_collisions"))] fn create_hashes_for_dict_arrays() { let strings = [Some("foo"), None, Some("bar"), Some("foo"), None]; @@ -485,7 +496,6 @@ #[test] // Tests actual values of hashes, which are different if forcing collisions - #[cfg(not(feature = "force_hash_collisions"))] fn create_hashes_for_list_arrays() { let data = vec![ Some(vec![Some(0), Some(1), Some(2)]), @@ -511,7 +521,6 @@ #[test] // Tests actual values of hashes, which are different if forcing collisions - #[cfg(not(feature = "force_hash_collisions"))] fn create_multi_column_hash_for_dict_arrays() { let strings1 = [Some("foo"), None, Some("bar")]; let strings2 = [Some("blarg"), Some("blah"), None]; diff --git a/rust/lakesoul-io/src/hdfs/mod.rs b/rust/lakesoul-io/src/hdfs/mod.rs index 7c0ddf51c..3c2cb96bf 100644 --- a/rust/lakesoul-io/src/hdfs/mod.rs +++ b/rust/lakesoul-io/src/hdfs/mod.rs @@ -10,21 +10,24 @@ use async_trait::async_trait; use bytes::Bytes; use datafusion::error::Result; use datafusion_common::DataFusionError; -use futures::stream::BoxStream; -// use futures::TryStreamExt; -use hdrs::{Client, ClientBuilder}; +use futures::stream::{empty, BoxStream}; +use futures::{FutureExt, StreamExt}; +use hdrs::{Client, ClientBuilder, File}; use object_store::path::Path; -use object_store::Error::Generic; -use object_store::{GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore}; -use parquet::data_type::AsBytes; +use object_store::Error::{Generic, Precondition}; +use object_store::{ + Attributes, GetOptions, GetRange, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, + ObjectStore, PutMode, PutMultipartOpts, PutOptions, PutPayload, PutResult, UploadPart, +}; use std::fmt::{Debug, Display, Formatter}; use std::io::ErrorKind::NotFound; -use std::io::{Read, Seek, SeekFrom}; +use std::io::SeekFrom; use std::ops::Range; use std::sync::Arc; -use tokio::io::{AsyncWrite, AsyncWriteExt}; +use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWrite, AsyncWriteExt}; +use tokio::sync::Mutex; use tokio_util::compat::{FuturesAsyncReadCompatExt, FuturesAsyncWriteCompatExt}; -// use tokio_util::io::ReaderStream; +use tokio_util::io::ReaderStream; pub struct Hdfs { client: Arc<Client>, @@ -45,25 +48,42 @@ impl Hdfs { }) } + fn file_exist(client: Arc<Client>, path: &str) -> object_store::Result<bool> { + let meta = client.metadata(path); + match meta { + Err(e) => { + if e.kind() == NotFound { + Ok(false) + } else { + Err(Generic { + store: "hdfs", + source: Box::new(e), + }) + } + } + Ok(_) => Ok(true), + } + } + + async fn is_file_exist(&self, path: &Path) -> object_store::Result<bool> { 
let t = add_leading_slash(path); let client = self.client.clone(); - maybe_spawn_blocking(Box::new(move || { - let meta = client.metadata(t.as_str()); - match meta { - Err(e) => { - if e.kind() == NotFound { - Ok(false) - } else { - Err(Generic { - store: "hdfs", - source: Box::new(e), - }) - } - } - Ok(_) => Ok(true), - } - })) + maybe_spawn_blocking(Box::new(move || Self::file_exist(client, t.as_str()))).await + } + + async fn delete(client: Arc, location: &Path) -> object_store::Result<()> { + let t = add_leading_slash(location); + let location = location.clone(); + maybe_spawn_blocking(move || match location.filename() { + None => client.remove_dir(t.as_str()).map_err(|e| Generic { + store: "hdfs", + source: Box::new(e), + }), + Some(_) => client.remove_file(t.as_str()).map_err(|e| Generic { + store: "hdfs", + source: Box::new(e), + }), + }) .await } } @@ -82,25 +102,36 @@ impl Debug for Hdfs { #[async_trait] impl ObjectStore for Hdfs { - async fn put(&self, location: &Path, bytes: Bytes) -> object_store::Result<()> { + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> object_store::Result { let location = add_leading_slash(location); - let mut async_write = self - .client - .open_file() - .write(true) - .create(true) - .truncate(true) + let mut async_write = self.client.open_file(); + async_write.write(true); + if opts.mode == PutMode::Create { + async_write.create_new(true); + } else { + async_write.create(true).truncate(true); + } + let mut async_write = async_write .async_open(location.as_ref()) .await .map_err(|e| Generic { store: "hdfs", source: Box::new(e), })? - .compat(); - async_write.write_all(bytes.as_bytes()).await.map_err(|e| Generic { - store: "hdfs", - source: Box::new(e), - })?; + .compat_write(); + for mut bytes in payload.into_iter() { + async_write.write_all_buf(&mut bytes) + .await + .map_err(|e| Generic { + store: "hdfs", + source: Box::new(e), + })? + } async_write.flush().await.map_err(|e| Generic { store: "hdfs", source: Box::new(e), @@ -108,71 +139,126 @@ impl ObjectStore for Hdfs { async_write.shutdown().await.map_err(|e| Generic { store: "hdfs", source: Box::new(e), + })?; + Ok(PutResult { + e_tag: None, + version: None, }) } - async fn put_multipart( + async fn put_multipart_opts( &self, location: &Path, - ) -> object_store::Result<(MultipartId, Box)> { + _opts: PutMultipartOpts, + ) -> object_store::Result> { // hdrs uses Unblocking underneath, so we don't have to // implement concurrent write - let location = add_leading_slash(location); + let location_ = add_leading_slash(location); let async_write = self .client .open_file() .write(true) .create(true) .truncate(true) - .async_open(location.as_ref()) + .async_open(location_.as_ref()) .await .map_err(|e| Generic { store: "hdfs", source: Box::new(e), - })?; - Ok((location.to_string(), Box::new(async_write.compat_write()))) + })? 
+ .compat_write(); + Ok(Box::new(HDFSMultiPartUpload { + client: self.client.clone(), + writer: Arc::new(Mutex::new(Box::new(async_write))), + location: location.clone(), + })) } - async fn abort_multipart(&self, location: &Path, _multipart_id: &MultipartId) -> object_store::Result<()> { - let file_exist = self.is_file_exist(location).await?; - if file_exist { - self.delete(location).await - } else { - Ok(()) + async fn get_opts(&self, location: &Path, options: GetOptions) -> object_store::Result { + let object_meta = self.head(location).await?; + if options.head { + return Ok(GetResult { + payload: GetResultPayload::Stream(empty::>().boxed()), + attributes: Attributes::default(), + range: 0..object_meta.size, + meta: object_meta, + }); } - } + let location = add_leading_slash(location); + let range = if let Some(r) = options.range { + match r { + GetRange::Bounded(range) => Ok(range), + GetRange::Offset(offset) => { + if offset >= object_meta.size { + Err(Precondition { + path: location.clone(), + source: format!( + "Request offset {} invalid against file size {}", + offset, object_meta.size + ) + .into(), + }) + } else { + Ok(offset..object_meta.size) + } + } + GetRange::Suffix(last) => { + if last > object_meta.size { + Err(Precondition { + path: location.clone(), + source: format!( + "Request last offset {} invalid against file size {}", + last, object_meta.size + ) + .into(), + }) + } else { + Ok((object_meta.size - last)..object_meta.size) + } + } + } + } else { + Ok(0..object_meta.size) + }?; - // async fn get(&self, location: &Path) -> object_store::Result { - // let path = add_leading_slash(location); - // let async_file = self - // .client - // .open_file() - // .read(true) - // .async_open(path.as_str()) - // .await - // .map_err(|e| Generic { - // store: "hdfs", - // source: Box::new(e), - // })?; - // let reader_stream = ReaderStream::new(async_file.compat()); - // Ok(GetResult{ - // payload: GetResultPayload::Stream(Box::pin(reader_stream.map_err(|e| Generic { - // store: "hdfs", - // source: Box::new(e), - // }))), - // meta: - // }) - // } - - async fn get_opts(&self, location: &Path, _options: GetOptions) -> object_store::Result { - self.get(location).await + let mut async_read = self + .client + .open_file() + .read(true) + .async_open(location.as_ref()) + .await + .map_err(|e| Generic { + store: "hdfs", + source: Box::new(e), + })? 
+ .compat(); + async_read + .seek(SeekFrom::Start(range.start as u64)) + .await + .map_err(|e| Generic { + store: "hdfs", + source: Box::new(e), + })?; + let read = async_read.take((range.end - range.start) as u64); + let stream = ReaderStream::new(read); + Ok(GetResult { + payload: GetResultPayload::Stream(Box::pin(stream.map(|item| { + item.map_err(|e| Generic { + store: "hdfs", + source: Box::new(e), + }) + }))), + meta: object_meta, + range, + attributes: Attributes::default(), + }) } async fn get_range(&self, location: &Path, range: Range) -> object_store::Result { let location = add_leading_slash(location); let client = self.client.clone(); maybe_spawn_blocking(move || { - let mut file = client + let file = client .open_file() .read(true) .open(location.as_ref()) @@ -180,26 +266,56 @@ impl ObjectStore for Hdfs { store: "hdfs", source: Box::new(e), })?; - file.seek(SeekFrom::Start(range.start as u64)).map_err(|e| Generic { - store: "hdfs", - source: Box::new(e), - })?; let to_read = range.end - range.start; let mut buf = vec![0; to_read]; - file.read_exact(buf.as_mut_slice()).map_err(|e| Generic { - store: "hdfs", - source: Box::new(e), - })?; - Ok(buf.into()) + let read_size = read_at(&file, &mut buf, range.start as u64)?; + if read_size != to_read { + Err(Generic { + store: "hdfs", + source: format!("read file {} range not complete", location).into(), + }) + } else { + Ok(buf.into()) + } }) .await } async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> object_store::Result> { - // tweak coalesce size and concurrency for hdfs + let location = add_leading_slash(location); + let client = self.client.clone(); + let file = Arc::new( + client + .open_file() + .read(true) + .open(location.as_ref()) + .map_err(|e| Generic { + store: "hdfs", + source: Box::new(e), + })?, + ); coalesce_ranges( ranges, - |range| self.get_range(location, range), + move |range| { + let location = location.clone(); + let file = file.clone(); + async move { + maybe_spawn_blocking(move || { + let to_read = range.end - range.start; + let mut buf = vec![0; to_read]; + let read_size = read_at(&file, &mut buf, range.start as u64)?; + if read_size != to_read { + Err(Generic { + store: "hdfs", + source: format!("read file {} range not complete", location).into(), + }) + } else { + Ok(buf.into()) + } + }) + .await + } + }, OBJECT_STORE_COALESCE_DEFAULT, ) .await @@ -221,32 +337,17 @@ impl ObjectStore for Hdfs { last_modified: meta.modified().into(), size: meta.len() as usize, e_tag: None, + version: None, }) }) .await } async fn delete(&self, location: &Path) -> object_store::Result<()> { - let t = add_leading_slash(location); - let location = location.clone(); - let client = self.client.clone(); - maybe_spawn_blocking(move || match location.filename() { - None => client.remove_dir(t.as_str()).map_err(|e| Generic { - store: "hdfs", - source: Box::new(e), - }), - Some(_) => client.remove_file(t.as_str()).map_err(|e| Generic { - store: "hdfs", - source: Box::new(e), - }), - }) - .await + Hdfs::delete(self.client.clone(), location).await } - async fn list( - &self, - _prefix: Option<&Path>, - ) -> object_store::Result>> { + fn list(&self, _prefix: Option<&Path>) -> BoxStream<'_, object_store::Result> { todo!() } @@ -285,10 +386,7 @@ impl ObjectStore for Hdfs { store: "hdfs", source: Box::new(e), })?; - async_write.shutdown().await.map_err(|e| Generic { - store: "hdfs", - source: Box::new(e), - }) + Ok(()) } async fn rename(&self, from: &Path, to: &Path) -> object_store::Result<()> { @@ -331,6 +429,67 @@ impl 
ObjectStore for Hdfs { } } +fn read_at(file: &File, buf: &mut [u8], offset: u64) -> object_store::Result { + file.read_at(buf, offset).map_err(|e| Generic { + store: "hdfs", + source: Box::new(e), + }) +} + +struct HDFSMultiPartUpload { + client: Arc, + writer: Arc>>, + location: Path, +} + +impl Debug for HDFSMultiPartUpload { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_fmt(format_args!("HDFS MultiPartUpload at location {}", self.location)) + } +} + +#[async_trait] +impl MultipartUpload for HDFSMultiPartUpload { + fn put_part(&mut self, data: PutPayload) -> UploadPart { + let writer = self.writer.clone(); + async move { + let mut writer = writer.lock().await; + for mut bytes in data.into_iter() { + writer.write_all_buf(&mut bytes) + .await + .map_err(|e| Generic { + store: "hdfs", + source: Box::new(e), + })?; + } + Ok(()) + } + .boxed() + } + + async fn complete(&mut self) -> object_store::Result { + let mut writer = self.writer.lock().await; + writer.flush().await.map_err(|e| Generic { + store: "hdfs", + source: Box::new(e), + })?; + Ok(PutResult { + e_tag: None, + version: None, + }) + } + + async fn abort(&mut self) -> object_store::Result<()> { + self.complete().await?; + let file_exist = Hdfs::file_exist(self.client.clone(), self.location.as_ref())?; + if file_exist { + Hdfs::delete(self.client.clone(), &self.location).await + } else { + Ok(()) + } + } +} + fn add_leading_slash(path: &Path) -> String { ["/", path.as_ref().trim_start_matches('/')].join("") } @@ -341,8 +500,9 @@ mod tests { use bytes::Bytes; use datafusion::datasource::object_store::ObjectStoreUrl; use futures::StreamExt; + use object_store::buffered::BufWriter; use object_store::path::Path; - use object_store::GetResult::Stream; + use object_store::GetResultPayload::Stream; use object_store::ObjectStore; use rand::distributions::{Alphanumeric, DistString}; use rand::thread_rng; @@ -363,7 +523,7 @@ mod tests { async fn read_file_from_hdfs(path: String, object_store: Arc) -> String { let file = object_store.get(&Path::from(path)).await.unwrap(); - match file { + match file.payload { Stream(s) => { let read_result = s .collect::>>() @@ -379,56 +539,28 @@ mod tests { #[tokio::test] async fn test_hdfs() { - let files = vec![ - format!("/user/{}/input/hdfs-site.xml", whoami::username()), - format!("/user/{}/input/yarn-site.xml", whoami::username()), - format!("/user/{}/input/core-site.xml", whoami::username()), - ]; - + // multipart upload and multi range get + let write_path = format!("/user/{}/output.test.txt", whoami::username()); + let complete_path = format!("hdfs://chenxu-dev:9000{}", write_path); + let url = Url::parse(complete_path.as_str()).unwrap(); let mut conf = LakeSoulIOConfigBuilder::new() .with_thread_num(2) .with_batch_size(8192) .with_max_row_group_size(250000) - .with_object_store_option("fs.defaultFS".to_string(), "hdfs://localhost:9000".to_string()) + .with_object_store_option("fs.defaultFS".to_string(), "hdfs://chenxu-dev:9000".to_string()) .with_object_store_option("fs.hdfs.user".to_string(), whoami::username()) .with_files(vec![ - format!("hdfs://localhost:9000{}", files[0]), - format!("hdfs://{}", files[1]), - files[2].clone(), + write_path.clone() ]) .build(); - let sess_ctx = create_session_context(&mut conf).unwrap(); - - let url = Url::parse(conf.files[0].as_str()).unwrap(); + println!("files: {:?}", conf.files); let object_store = sess_ctx .runtime_env() .object_store(ObjectStoreUrl::parse(&url[..url::Position::BeforePath]).unwrap()) .unwrap(); - assert_eq!( - 
conf.files, - vec![ - format!("hdfs://localhost:9000{}", files[0]), - format!("hdfs://localhost:9000{}", files[1]), - format!("hdfs://localhost:9000{}", files[2]), - ] - ); - let meta0 = object_store.head(&Path::from(files[0].as_str())).await.unwrap(); - assert_eq!(meta0.location, Path::from(files[0].as_str())); - assert_eq!(meta0.size, 867); - - let s = read_file_from_hdfs(files[1].clone(), object_store.clone()).await; - let path = format!("{}/etc/hadoop/yarn-site.xml", std::env::var("HADOOP_HOME").unwrap()); - let f = std::fs::read_to_string(path).unwrap(); - assert_eq!(s, f); - - // multipart upload and multi range get - let write_path = format!("/user/{}/output/test.txt", whoami::username()); - let (_, mut write) = object_store - .put_multipart(&Path::from(write_path.as_str())) - .await - .unwrap(); + let mut write = BufWriter::new(object_store.clone(), Path::from(write_path.clone())); let mut rng = thread_rng(); let size = 64 * 1024 * 1024usize; @@ -445,6 +577,21 @@ mod tests { write.shutdown().await.unwrap(); drop(write); + assert_eq!( + conf.files, + vec![ + complete_path, + ] + ); + let meta0 = object_store.head(&Path::from(write_path.as_str())).await.unwrap(); + assert_eq!(meta0.location, Path::from(write_path.as_str())); + assert_eq!(meta0.size, size); + + // test get + let s = read_file_from_hdfs(write_path.clone(), object_store.clone()).await; + assert_eq!(s, string); + + // test get_range let read_concurrency = 16; let step = size / read_concurrency; let ranges = (0..read_concurrency) @@ -466,6 +613,7 @@ mod tests { let result = bytes_to_string(result); assert_eq!(result, string); + // test get_ranges let result = object_store .get_ranges(&Path::from(write_path.as_str()), ranges.as_slice()) .await diff --git a/rust/lakesoul-io/src/helpers.rs b/rust/lakesoul-io/src/helpers.rs index b5325f8cd..2482d3e09 100644 --- a/rust/lakesoul-io/src/helpers.rs +++ b/rust/lakesoul-io/src/helpers.rs @@ -21,7 +21,7 @@ use datafusion::{ physical_planner::create_physical_sort_expr, }; use datafusion_common::DataFusionError::{External, Internal}; -use datafusion_common::{cast::as_primitive_array, DFSchema, DataFusionError, Result, ScalarValue}; +use datafusion_common::{cast::as_primitive_array, DFSchema, Result, ScalarValue}; use datafusion_substrait::substrait::proto::Plan; use object_store::path::Path; @@ -43,7 +43,6 @@ use crate::{ pub fn column_names_to_physical_sort_expr( columns: &[String], input_dfschema: &DFSchema, - input_schema: &Schema, session_state: &SessionState, ) -> Result> { columns @@ -52,7 +51,6 @@ pub fn column_names_to_physical_sort_expr( create_physical_sort_expr( &col(column).sort(true, true), input_dfschema, - input_schema, session_state.execution_props(), ) }) @@ -62,7 +60,6 @@ pub fn column_names_to_physical_sort_expr( pub fn column_names_to_physical_expr( columns: &[String], input_dfschema: &DFSchema, - input_schema: &Schema, session_state: &SessionState, ) -> Result>> { let runtime_expr = columns @@ -71,7 +68,6 @@ pub fn column_names_to_physical_expr( create_physical_expr( &col(column), input_dfschema, - input_schema, session_state.execution_props(), ) }) @@ -92,7 +88,7 @@ fn range_partition_to_partition_cols( pub fn get_columnar_values( batch: &RecordBatch, range_partitions: Arc>, -) -> datafusion::error::Result> { +) -> Result> { range_partitions .iter() .map(|range_col| { @@ -102,12 +98,12 @@ pub fn get_columnar_values( Err(e) => Err(e), } } else { - Err(datafusion::error::DataFusionError::External( + Err(External( format!("Invalid partition desc of {}", 
range_col).into(), )) } }) - .collect::>>() + .collect::>>() } pub fn format_scalar_value(v: &ScalarValue) -> String { @@ -281,7 +277,7 @@ pub fn partition_desc_to_scalar_values(schema: SchemaRef, partition_desc: String part_values.push((name, val)); } _ => { - return Err(datafusion::error::DataFusionError::External( + return Err(External( format!("Invalid partition_desc: {}", partition_desc).into(), )) } @@ -320,7 +316,7 @@ pub fn partition_desc_from_file_scan_config(conf: &FileScanConfig) -> Result<(St .map(|(idx, col)| (col.name().clone(), file.partition_values[idx].to_string())), ), )), - None => Err(DataFusionError::External( + None => Err(External( format!("Invalid file_group {:?}", conf.file_groups).into(), )), } @@ -366,7 +362,6 @@ pub async fn listing_table_from_lakesoul_io_config( ListingTableConfig::new_with_multi_paths(table_paths) .with_listing_options(listing_options) - // .with_schema(Arc::new(builder.finish())) .with_schema(resolved_schema) } true => { @@ -376,9 +371,8 @@ pub async fn listing_table_from_lakesoul_io_config( let listing_options = ListingOptions::new(file_format.clone()) .with_file_extension(".parquet") - .with_table_partition_cols(table_partition_cols) - .with_insert_mode(datafusion::datasource::listing::ListingTableInsertMode::AppendNewFiles); - let prefix = ListingTableUrl::parse_create_local_if_not_exists(lakesoul_io_config.prefix.clone(), true)?; + .with_table_partition_cols(table_partition_cols); + let prefix = ListingTableUrl::parse(lakesoul_io_config.prefix.clone())?; ListingTableConfig::new(prefix) .with_listing_options(listing_options) @@ -397,7 +391,7 @@ pub async fn infer_schema( // Create default parquet options let object_store_url = table_paths .first() - .ok_or(DataFusionError::Internal("no table path".to_string()))? + .ok_or(Internal("no table path".to_string()))? 
.object_store(); let store = sc.runtime_env().object_store(object_store_url.clone())?; let mut objects = vec![]; @@ -425,7 +419,7 @@ pub fn apply_partition_filter(wrapper: JniWrapper, schema: SchemaRef, filter: Pl let batch = batch_from_partition(&wrapper, schema, index_filed)?; let dataframe = context.read_batch(batch)?; - let df_filter = Parser::parse_proto(&filter, dataframe.schema())?; + let df_filter = Parser::parse_substrait_plan(filter, dataframe.schema())?; let results = dataframe.filter(df_filter)?.collect().await?; @@ -466,7 +460,7 @@ fn batch_from_partition(wrapper: &JniWrapper, schema: SchemaRef, index_field: Fi .collect::>>()?; // Add index column - let mut fields_with_index = schema.all_fields().into_iter().cloned().collect::>(); + let mut fields_with_index = schema.flattened_fields().into_iter().cloned().collect::>(); fields_with_index.push(index_field); let schema_with_index = SchemaRef::new(Schema::new(fields_with_index)); columns.push(Arc::new(UInt32Array::from( diff --git a/rust/lakesoul-io/src/lakesoul_io_config.rs b/rust/lakesoul-io/src/lakesoul_io_config.rs index 64c65db52..60ea23c39 100644 --- a/rust/lakesoul-io/src/lakesoul_io_config.rs +++ b/rust/lakesoul-io/src/lakesoul_io_config.rs @@ -11,12 +11,12 @@ use arrow::error::ArrowError; use arrow_schema::{Schema, SchemaRef}; use datafusion::datasource::object_store::ObjectStoreUrl; pub use datafusion::error::{DataFusionError, Result}; -use datafusion::execution::context::{QueryPlanner, SessionState}; +use datafusion::execution::context::QueryPlanner; use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; +use datafusion::execution::session_state::SessionStateBuilder; use datafusion::logical_expr::Expr; use datafusion::optimizer::analyzer::type_coercion::TypeCoercion; use datafusion::optimizer::push_down_filter::PushDownFilter; -use datafusion::optimizer::push_down_projection::PushDownProjection; use datafusion::optimizer::rewrite_disjunctive_predicate::RewriteDisjunctivePredicate; use datafusion::optimizer::simplify_expressions::SimplifyExpressions; use datafusion::optimizer::unwrap_cast_in_comparison::UnwrapCastInComparison; @@ -347,9 +347,10 @@ pub fn register_s3_object_store(url: &Url, config: &LakeSoulIOConfig, runtime: & } if bucket.is_none() { - return Err(DataFusionError::ArrowError(ArrowError::InvalidArgumentError( - "missing fs.s3a.bucket".to_string(), - ))); + return Err(DataFusionError::ArrowError( + ArrowError::InvalidArgumentError("missing fs.s3a.bucket".to_string()), + None, + )); } let retry_config = RetryConfig::default(); @@ -476,7 +477,6 @@ pub fn create_session_context_with_planner( let mut sess_conf = SessionConfig::default() .with_batch_size(config.batch_size) .with_parquet_pruning(true) - .with_prefetch(config.prefetch_size) .with_information_schema(true) .with_create_default_catalog_and_schema(true); @@ -484,7 +484,6 @@ pub fn create_session_context_with_planner( sess_conf.options_mut().optimizer.prefer_hash_join = false; //if true, panicked at 'range end out of bounds' sess_conf.options_mut().execution.parquet.pushdown_filters = config.parquet_filter_pushdown; sess_conf.options_mut().execution.target_partitions = 1; - // sess_conf.options_mut().catalog.default_catalog = "lakesoul".into(); let runtime = RuntimeEnv::new(RuntimeConfig::new())?; @@ -519,11 +518,20 @@ pub fn create_session_context_with_planner( info!("NativeIO final config: {:?}", config); // create session context - let mut state = if let Some(planner) = planner { - SessionState::new_with_config_rt(sess_conf, 
Arc::new(runtime)).with_query_planner(planner) - } else { - SessionState::new_with_config_rt(sess_conf, Arc::new(runtime)) - }; + let mut builder = SessionStateBuilder::new() + .with_config(sess_conf) + .with_runtime_env(Arc::new(runtime)) + .with_analyzer_rules(vec![Arc::new(TypeCoercion {})]) + .with_optimizer_rules(vec![ + Arc::new(PushDownFilter {}), + Arc::new(SimplifyExpressions {}), + Arc::new(UnwrapCastInComparison {}), + Arc::new(RewriteDisjunctivePredicate {}), + ]); + if let Some(planner) = planner { + builder = builder.with_query_planner(planner); + } + let state = builder.build(); // only keep projection/filter rules as others are unnecessary let physical_opt_rules = state .physical_optimizers() @@ -537,18 +545,9 @@ pub fn create_session_context_with_planner( } }) .collect(); - state = state - .with_analyzer_rules(vec![Arc::new(TypeCoercion {})]) - .with_optimizer_rules(vec![ - Arc::new(PushDownFilter {}), - Arc::new(PushDownProjection {}), - Arc::new(SimplifyExpressions {}), - Arc::new(UnwrapCastInComparison {}), - Arc::new(RewriteDisjunctivePredicate {}), - ]) - .with_physical_optimizer_rules(physical_opt_rules); + let builder = SessionStateBuilder::new_from_existing(state).with_physical_optimizer_rules(physical_opt_rules); - Ok(SessionContext::new_with_state(state)) + Ok(SessionContext::new_with_state(builder.build())) } #[cfg(test)] diff --git a/rust/lakesoul-io/src/lakesoul_reader.rs b/rust/lakesoul-io/src/lakesoul_reader.rs index af28908a1..4e82bd7bb 100644 --- a/rust/lakesoul-io/src/lakesoul_reader.rs +++ b/rust/lakesoul-io/src/lakesoul_reader.rs @@ -22,7 +22,7 @@ use tokio::sync::Mutex; use tokio::task::JoinHandle; use crate::datasource::file_format::LakeSoulParquetFormat; -use crate::datasource::listing::LakeSoulListingTable; +use crate::datasource::table::LakeSoulTableProvider; use crate::datasource::physical_plan::merge::convert_filter; use crate::datasource::physical_plan::merge::prune_filter_and_execute; use crate::lakesoul_io_config::{create_session_context, LakeSoulIOConfig}; @@ -56,7 +56,7 @@ impl LakeSoulReader { Arc::new(ParquetFormat::new()), self.config.clone(), )); - let source = LakeSoulListingTable::new_with_config_and_format( + let source = LakeSoulTableProvider::new_with_config_and_format( &self.sess_ctx.state(), self.config.clone(), file_format, diff --git a/rust/lakesoul-io/src/lakesoul_writer.rs b/rust/lakesoul-io/src/lakesoul_writer.rs index 45ced0d56..2f017e503 100644 --- a/rust/lakesoul-io/src/lakesoul_writer.rs +++ b/rust/lakesoul-io/src/lakesoul_writer.rs @@ -12,32 +12,33 @@ use arrow::record_batch::RecordBatch; use arrow_schema::SchemaRef; use async_trait::async_trait; use atomic_refcell::AtomicRefCell; +use bytes::Bytes; use datafusion::datasource::object_store::ObjectStoreUrl; use datafusion::error::Result; use datafusion::execution::context::TaskContext; use datafusion::physical_expr::expressions::{col, Column}; -use datafusion::physical_expr::{PhysicalExpr, PhysicalSortExpr}; +use datafusion::physical_expr::{EquivalenceProperties, PhysicalExpr, PhysicalSortExpr}; use datafusion::physical_plan::projection::ProjectionExec; use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::stream::{RecordBatchReceiverStream, RecordBatchReceiverStreamBuilder}; -use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, SendableRecordBatchStream}; -use datafusion_common::DataFusionError::Internal; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, 
Partitioning, PlanProperties, SendableRecordBatchStream, +}; +use datafusion_common::DataFusionError::{ArrowError, Internal, ParquetError}; use datafusion_common::{project_schema, DataFusionError}; +use futures::future::BoxFuture; +use futures::FutureExt; +use object_store::buffered::BufWriter; use object_store::path::Path; -use object_store::{MultipartId, ObjectStore}; -use parquet::arrow::ArrowWriter; -use parquet::basic::Compression; +use parquet::arrow::async_writer::AsyncFileWriter; +use parquet::arrow::AsyncArrowWriter; use parquet::file::properties::WriterProperties; use rand::distributions::DistString; use std::any::Any; use std::borrow::Borrow; -use std::collections::{HashMap, VecDeque}; +use std::collections::HashMap; use std::fmt::{Debug, Formatter}; -use std::io::ErrorKind::AddrInUse; -use std::io::Write; use std::sync::Arc; -use tokio::io::AsyncWrite; -use tokio::io::AsyncWriteExt; use tokio::runtime::Runtime; use tokio::sync::mpsc::Sender; use tokio::sync::Mutex; @@ -57,22 +58,32 @@ pub trait AsyncBatchWriter { fn schema(&self) -> SchemaRef; } +#[derive(Clone)] +pub struct BufferedWriter(Arc>); + +impl AsyncFileWriter for BufferedWriter { + fn write(&mut self, bs: Bytes) -> BoxFuture<'_, parquet::errors::Result<()>> { + async move { + let mut write = self.0.lock().await; + write.write(bs).await + }.boxed() + } + + fn complete(&mut self) -> BoxFuture<'_, parquet::errors::Result<()>> { + async move { + let mut write = self.0.lock().await; + write.complete().await + }.boxed() + } +} + /// An async writer using object_store's multi-part upload feature for cloud storage. -/// This writer uses a `VecDeque` as `std::io::Write` for arrow-rs's ArrowWriter. -/// Everytime when a new RowGroup is flushed, the length of the VecDeque would grow. -/// At this time, we pass the VecDeque as `bytes::Buf` to `AsyncWriteExt::write_buf` provided -/// by object_store, which would drain and copy the content of the VecDeque so that we could reuse it. -/// The `CloudMultiPartUpload` itself would try to concurrently upload parts, and -/// all parts will be committed to cloud storage by shutdown the `AsyncWrite` object. 
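The old buffering scheme described above is gone in this patch: parquet 52's AsyncArrowWriter drives any AsyncFileWriter, and object_store's BufWriter buffers small writes and transparently switches to multipart upload for large ones. A minimal sketch of the new write path, assuming parquet 52 with its object_store feature (which provides the AsyncFileWriter impl for BufWriter) and an InMemory store standing in for HDFS:

    use std::sync::Arc;

    use arrow::array::Int32Array;
    use arrow::datatypes::{DataType, Field, Schema};
    use arrow::record_batch::RecordBatch;
    use object_store::buffered::BufWriter;
    use object_store::memory::InMemory;
    use object_store::path::Path;
    use object_store::ObjectStore;
    use parquet::arrow::AsyncArrowWriter;

    #[tokio::main]
    async fn main() -> Result<(), Box<dyn std::error::Error>> {
        // Any ObjectStore works here; InMemory stands in for the HDFS store.
        let store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
        let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int32, false)]));
        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(Int32Array::from(vec![1, 2, 3]))])?;

        // BufWriter buffers in memory and switches to multipart upload once
        // the payload grows, so no in-memory VecDeque shim is needed.
        let writer = BufWriter::new(store.clone(), Path::from("demo.parquet"));
        let mut arrow_writer = AsyncArrowWriter::try_new(writer, schema, None)?;
        arrow_writer.write(&batch).await?;
        arrow_writer.close().await?;

        assert!(store.head(&Path::from("demo.parquet")).await?.size > 0);
        Ok(())
    }

The BufferedWriter newtype above exists only to share the BufWriter behind an Arc of a Mutex, so that abort() can still reach it after the other handle has been given to AsyncArrowWriter.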
pub struct MultiPartAsyncWriter { - in_mem_buf: InMemBuf, task_context: Arc, schema: SchemaRef, - writer: Box, - multi_part_id: MultipartId, - arrow_writer: ArrowWriter, + buffered_writer: BufferedWriter, + async_writer: AsyncArrowWriter, _config: LakeSoulIOConfig, - object_store: Arc, path: Path, absolute_path: String, num_rows: u64, @@ -98,41 +109,22 @@ pub struct PartitioningAsyncWriter { err: Option, } -/// A VecDeque which is both std::io::Write and bytes::Buf -#[derive(Clone)] -struct InMemBuf(Arc>>); - -impl Write for InMemBuf { - #[inline] - fn write(&mut self, buf: &[u8]) -> std::io::Result { - let mut v = self.0.try_borrow_mut().map_err(|_| std::io::Error::from(AddrInUse))?; - v.extend(buf); - Ok(buf.len()) - } - - #[inline] - fn flush(&mut self) -> std::io::Result<()> { - Ok(()) - } - - #[inline] - fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> { - let mut v = self.0.try_borrow_mut().map_err(|_| std::io::Error::from(AddrInUse))?; - v.extend(buf); - Ok(()) - } -} - pub struct ReceiverStreamExec { receiver_stream_builder: AtomicRefCell>, schema: SchemaRef, + cache: PlanProperties, } impl ReceiverStreamExec { pub fn new(receiver_stream_builder: RecordBatchReceiverStreamBuilder, schema: SchemaRef) -> Self { Self { receiver_stream_builder: AtomicRefCell::new(Some(receiver_stream_builder)), - schema, + schema: schema.clone(), + cache: PlanProperties::new( + EquivalenceProperties::new(schema), + Partitioning::UnknownPartitioning(1), + ExecutionMode::Bounded, + ), } } } @@ -144,12 +136,16 @@ impl Debug for ReceiverStreamExec { } impl DisplayAs for ReceiverStreamExec { - fn fmt_as(&self, _t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result { write!(f, "ReceiverStreamExec") } } impl ExecutionPlan for ReceiverStreamExec { + fn name(&self) -> &str { + "ReceiverStreamExec" + } + fn as_any(&self) -> &dyn Any { self } @@ -158,15 +154,11 @@ impl ExecutionPlan for ReceiverStreamExec { Arc::clone(&self.schema) } - fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(1) + fn properties(&self) -> &PlanProperties { + &self.cache } - fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { - None - } - - fn children(&self) -> Vec> { + fn children(&self) -> Vec<&Arc> { unimplemented!() } @@ -179,7 +171,7 @@ impl ExecutionPlan for ReceiverStreamExec { .receiver_stream_builder .borrow_mut() .take() - .ok_or(DataFusionError::Internal("empty receiver stream".to_string()))?; + .ok_or(Internal("empty receiver stream".to_string()))?; Ok(builder.build()) } } @@ -189,10 +181,7 @@ impl MultiPartAsyncWriter { if config.files.is_empty() { return Err(Internal("wrong number of file names provided for writer".to_string())); } - let file_name = &config - .files - .last() - .ok_or(DataFusionError::Internal("wrong file name".to_string()))?; + let file_name = &config.files.last().ok_or(Internal("wrong file name".to_string()))?; // local style path should have already been handled in create_session_context, // so we don't have to deal with ParseError::RelativeUrlWithoutBase here @@ -207,10 +196,6 @@ impl MultiPartAsyncWriter { }?; // get underlying multipart uploader - let (multipart_id, async_writer) = object_store.put_multipart(&path).await?; - let in_mem_buf = InMemBuf(Arc::new(AtomicRefCell::new(VecDeque::::with_capacity( - 16 * 1024 * 1024, // 16kb - )))); let schema = uniform_schema(config.target_schema.0.clone()); // O(nm), n = number of fields, m = number of range 
partitions @@ -225,27 +210,24 @@ impl MultiPartAsyncWriter { .collect::>(); let writer_schema = project_schema(&schema, Some(&schema_projection_excluding_range))?; - let arrow_writer = ArrowWriter::try_new( - in_mem_buf.clone(), - writer_schema, + let buffered_writer = BufferedWriter(Arc::new(Mutex::new(BufWriter::new(object_store, path.clone())))); + let async_writer = AsyncArrowWriter::try_new( + buffered_writer.clone(), + writer_schema.clone(), Some( WriterProperties::builder() - .set_max_row_group_size(config.max_row_group_size) .set_write_batch_size(config.batch_size) - .set_compression(Compression::SNAPPY) + .set_max_row_group_size(config.max_row_group_size) .build(), ), )?; Ok(MultiPartAsyncWriter { - in_mem_buf, task_context, - schema, - writer: async_writer, - multi_part_id: multipart_id, - arrow_writer, + schema: writer_schema, + buffered_writer, + async_writer, _config: config.clone(), - object_store, path, absolute_path: file_name.to_string(), num_rows: 0, @@ -257,33 +239,20 @@ impl MultiPartAsyncWriter { Self::try_new_with_context(&mut config, task_context).await } - async fn write_batch( - batch: RecordBatch, - arrow_writer: &mut ArrowWriter, - in_mem_buf: &mut InMemBuf, - // underlying writer - writer: &mut Box, - ) -> Result<()> { - arrow_writer.write(&batch)?; - let mut v = in_mem_buf - .0 - .try_borrow_mut() - .map_err(|e| Internal(format!("{:?}", e)))?; - if v.len() > 0 { - MultiPartAsyncWriter::write_part(writer, &mut v).await - } else { - Ok(()) - } + async fn write_batch(&mut self, batch: RecordBatch) -> Result<()> { + self.async_writer.write(&batch).await.map_err(ParquetError) } - pub async fn write_part( - writer: &mut Box, - in_mem_buf: &mut VecDeque, - ) -> Result<()> { - writer.write_all_buf(in_mem_buf).await?; + async fn flush_and_close(self) -> Result<()> { + self.async_writer.close().await.map_err(ParquetError)?; Ok(()) } + async fn abort(&mut self) -> Result<()> { + let mut write = self.buffered_writer.0.lock().await; + write.abort().await.map_err(DataFusionError::ObjectStore) + } + pub fn nun_rows(&self) -> u64 { self.num_rows } @@ -306,34 +275,19 @@ impl AsyncBatchWriter for MultiPartAsyncWriter { async fn write_record_batch(&mut self, batch: RecordBatch) -> Result<()> { let batch = uniform_record_batch(batch)?; self.num_rows += batch.num_rows() as u64; - MultiPartAsyncWriter::write_batch(batch, &mut self.arrow_writer, &mut self.in_mem_buf, &mut self.writer).await + self.write_batch(batch).await } async fn flush_and_close(self: Box) -> Result> { // close arrow writer to flush remaining rows - let mut this = *self; - let arrow_writer = this.arrow_writer; - arrow_writer.close()?; - let mut v = this - .in_mem_buf - .0 - .try_borrow_mut() - .map_err(|e| Internal(format!("{:?}", e)))?; - if v.len() > 0 { - MultiPartAsyncWriter::write_part(&mut this.writer, &mut v).await?; - } - // shutdown multi-part async writer to complete the upload - this.writer.flush().await?; - this.writer.shutdown().await?; + let this = *self; + this.flush_and_close().await?; Ok(vec![]) } async fn abort_and_close(self: Box) -> Result> { - let this = *self; - this.object_store - .abort_multipart(&this.path, &this.multi_part_id) - .await - .map_err(DataFusionError::ObjectStore)?; + let mut this = *self; + this.abort().await?; Ok(vec![]) } @@ -508,7 +462,7 @@ impl PartitioningAsyncWriter { let write_id = rand::distributions::Alphanumeric.sample_string(&mut rand::thread_rng(), 16); let partitioned_file_path_and_row_count = Arc::new(Mutex::new(HashMap::, u64)>::new())); - for i in 
0..partitioning_exec.output_partitioning().partition_count() { + for i in 0..partitioning_exec.properties().output_partitioning().partition_count() { let sink_task = tokio::spawn(Self::pull_and_sink( partitioning_exec.clone(), i, @@ -826,7 +780,7 @@ impl SyncSendableMutableLakeSoulWriter { .fields .iter() .filter(|f| !config.aux_sort_cols.contains(f.name())) - .map(|f| schema.index_of(f.name().as_str()).map_err(DataFusionError::ArrowError)) + .map(|f| schema.index_of(f.name().as_str()).map_err(|e| ArrowError(e, None))) .collect::>>()?; Arc::new(schema.project(proj_indices.borrow())?) } else { diff --git a/rust/lakesoul-io/src/repartition/mod.rs b/rust/lakesoul-io/src/repartition/mod.rs index d7acc7e52..700b7483b 100644 --- a/rust/lakesoul-io/src/repartition/mod.rs +++ b/rust/lakesoul-io/src/repartition/mod.rs @@ -18,7 +18,6 @@ use datafusion::{ }, physical_expr::PhysicalSortExpr, physical_plan::{ - common::{AbortOnDropMany, AbortOnDropSingle}, metrics::{ExecutionPlanMetricsSet, MetricBuilder}, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PhysicalExpr, RecordBatchStream, SendableRecordBatchStream, @@ -28,8 +27,11 @@ use datafusion::{physical_expr::physical_exprs_equal, physical_plan::metrics}; use datafusion_common::{DataFusionError, Result}; use arrow_array::{builder::UInt64Builder, ArrayRef, RecordBatch}; +use datafusion::common::runtime::SpawnedTask; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::PlanProperties; +use datafusion_common::DataFusionError::ArrowError; use futures::{FutureExt, Stream, StreamExt}; -use tokio::task::JoinHandle; use crate::{hash_utils::create_hashes, repartition::distributor_channels::channels}; @@ -62,7 +64,7 @@ struct RepartitionByRangeAndHashExecState { >, /// Helper that ensures that the background job is killed once it is no longer needed. 
- abort_helper: Arc>, + abort_helper: Arc>>, } /// A utility that can be used to partition batches based on [`Partitioning`] @@ -191,7 +193,7 @@ impl BatchPartitioner { let columns = batch .columns() .iter() - .map(|c| arrow::compute::take(c.as_ref(), &indices, None).map_err(DataFusionError::ArrowError)) + .map(|c| arrow::compute::take(c.as_ref(), &indices, None).map_err(|e| ArrowError(e, None))) .collect::>>()?; let batch = RecordBatch::try_new(batch.schema(), columns)?; @@ -266,6 +268,8 @@ pub struct RepartitionByRangeAndHashExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, + + cache: PlanProperties, } impl RepartitionByRangeAndHashExec { @@ -299,7 +303,7 @@ impl DisplayAs for RepartitionByRangeAndHashExec { "{}: hash_partitioning={}, input_partitions={}", self.name(), self.hash_partitioning, - self.input.output_partitioning().partition_count() + self.input.properties().output_partitioning().partition_count() )?; if let Some(sort_exprs) = self.sort_exprs() { @@ -320,7 +324,7 @@ impl RepartitionByRangeAndHashExec { range_partitioning_expr: Vec>, hash_partitioning: Partitioning, ) -> Result { - if let Some(ordering) = input.output_ordering() { + if let Some(ordering) = input.properties().output_ordering() { let lhs = ordering .iter() .map(|sort_expr| sort_expr.expr.clone()) @@ -341,21 +345,30 @@ impl RepartitionByRangeAndHashExec { if physical_exprs_equal(&lhs, &rhs) { return Ok(Self { - input, + input: input.clone(), range_partitioning_expr, - hash_partitioning, + hash_partitioning: hash_partitioning.clone(), state: Arc::new(Mutex::new(RepartitionByRangeAndHashExecState { channels: HashMap::new(), - abort_helper: Arc::new(AbortOnDropMany::<()>(vec![])), + abort_helper: Arc::new(vec![]), })), metrics: ExecutionPlanMetricsSet::new(), + cache: PlanProperties::new( + if let Some(output_ordering) = input.properties().output_ordering() { + EquivalenceProperties::new_with_orderings(input.schema(), &[output_ordering.to_vec()]) + } else { + EquivalenceProperties::new(input.schema()) + }, + hash_partitioning, + input.properties().execution_mode, + ), }); } } Err(DataFusionError::Plan( format!( "Input ordering {:?} mismatch for RepartitionByRangeAndHashExec with range_partitioning_expr={:?}, hash_partitioning={}", - input.output_ordering(), + input.properties().output_ordering(), range_partitioning_expr, hash_partitioning, )) @@ -364,7 +377,7 @@ impl RepartitionByRangeAndHashExec { /// Return the sort expressions that are used to merge fn sort_exprs(&self) -> Option<&[PhysicalSortExpr]> { - self.input.output_ordering() + self.input.properties().output_ordering() } /// Pulls data from the specified input plan, feeding it to the @@ -452,13 +465,10 @@ impl RepartitionByRangeAndHashExec { /// each of the output tx channels to signal one of the inputs is /// complete. Upon error, propagates the errors to all output tx /// channels. - async fn wait_for_task( - input_task: AbortOnDropSingle>, - txs: HashMap>, - ) { + async fn wait_for_task(input_task: SpawnedTask>, txs: HashMap>) { // wait for completion, and propagate error // note we ignore errors on send (.ok) as that means the receiver has already shutdown. 
- match input_task.await { + match input_task.join().await { // Error in joining task Err(e) => { let e = Arc::new(e); @@ -473,7 +483,7 @@ impl RepartitionByRangeAndHashExec { } // Error from running input task Ok(Err(e)) => { - let e = Arc::new(e); + let e: Arc = Arc::new(e); for (_, tx) in txs { // wrap it because need to send error to all output partitions @@ -493,6 +503,10 @@ impl RepartitionByRangeAndHashExec { } impl ExecutionPlan for RepartitionByRangeAndHashExec { + fn name(&self) -> &str { + "RepartitionByRangeAndHashExec" + } + /// Return a reference to Any that can be used for downcasting fn as_any(&self) -> &dyn Any { self @@ -503,32 +517,12 @@ impl ExecutionPlan for RepartitionByRangeAndHashExec { self.input.schema() } - fn output_partitioning(&self) -> Partitioning { - self.hash_partitioning.clone() - } - - /// Specifies whether this plan generates an infinite stream of records. - /// If the plan does not support pipelining, but its input(s) are - /// infinite, returns an error to indicate this. - fn unbounded_output(&self, children: &[bool]) -> Result { - Ok(children[0]) - } - - fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { - if self.maintains_input_order()[0] { - self.input().output_ordering() - } else { - None - } - } - - fn maintains_input_order(&self) -> Vec { - // We preserve ordering when input partitioning is 1 - vec![self.input().output_partitioning().partition_count() <= 1] + fn properties(&self) -> &PlanProperties { + &self.cache } - fn children(&self) -> Vec> { - vec![self.input.clone()] + fn children(&self) -> Vec<&Arc> { + vec![&self.input] } fn with_new_children(self: Arc, mut children: Vec>) -> Result> { @@ -546,18 +540,12 @@ impl ExecutionPlan for RepartitionByRangeAndHashExec { // lock mutexes let mut state = self.state.lock(); - let num_input_partitions = self.input.output_partitioning().partition_count(); + let num_input_partitions = self.input.properties().output_partitioning().partition_count(); let num_output_partitions = self.hash_partitioning.partition_count(); // if this is the first partition to be invoked then we need to set up initial state if state.channels.is_empty() { let (txs, rxs) = { - // let (txs, rxs) = partition_aware_channels(num_input_partitions, num_output_partitions); - // // Take transpose of senders and receivers. `state.channels` keeps track of entries per output partition - // let txs = transpose(txs); - // let rxs = transpose(rxs); - // (txs, rxs) - // } else { // create one channel per *output* partition // note we use a custom channel that ensures there is always data for each receiver // but limits the amount of buffering if required. 
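wait_for_task now joins a SpawnedTask instead of awaiting a raw JoinHandle wrapped in AbortOnDropSingle. The key property is that dropping a SpawnedTask aborts the underlying tokio task, which is what lets the state struct hold a plain vector of SpawnedTask handles as its abort helper. A minimal sketch of that behavior, assuming only tokio (with the time feature) and the datafusion common runtime module imported in this hunk:

    use std::time::Duration;

    use datafusion::common::runtime::SpawnedTask;

    #[tokio::main]
    async fn main() {
        // Completed tasks are collected through join(), mirroring
        // wait_for_task's `input_task.join().await`.
        let task = SpawnedTask::spawn(async { 21 * 2 });
        assert_eq!(task.join().await.unwrap(), 42);

        // Dropping the handle aborts the inner tokio task, providing the
        // cleanup that AbortOnDropSingle/AbortOnDropMany used to supply.
        let pending = SpawnedTask::spawn(async {
            tokio::time::sleep(Duration::from_secs(3600)).await;
        });
        drop(pending);
    }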
@@ -588,7 +576,7 @@ impl ExecutionPlan for RepartitionByRangeAndHashExec { let r_metrics = RepartitionMetrics::new(i, partition, &self.metrics); - let input_task: JoinHandle> = tokio::spawn(Self::pull_from_input( + let input_task = SpawnedTask::spawn(Self::pull_from_input( self.input.clone(), i, txs.clone(), @@ -600,8 +588,8 @@ impl ExecutionPlan for RepartitionByRangeAndHashExec { // In a separate task, wait for each input to be done // (and pass along any errors, including panic!s) - let join_handle = tokio::spawn(Self::wait_for_task( - AbortOnDropSingle::new(input_task), + let join_handle = SpawnedTask::spawn(Self::wait_for_task( + input_task, txs.into_iter() .map(|(partition, (tx, _reservation))| (partition, tx)) .collect(), @@ -609,7 +597,7 @@ impl ExecutionPlan for RepartitionByRangeAndHashExec { join_handles.push(join_handle); } - state.abort_helper = Arc::new(AbortOnDropMany(join_handles)) + state.abort_helper = Arc::new(join_handles); } trace!( @@ -625,40 +613,6 @@ impl ExecutionPlan for RepartitionByRangeAndHashExec { .remove(&partition) .ok_or(DataFusionError::Internal("partition not used yet".to_string()))?; - // if self.preserve_order { - - // // Store streams from all the input partitions: - // let input_streams = rx - // .into_iter() - // .map(|receiver| { - // Box::pin(PerPartitionStream { - // schema: self.schema(), - // receiver, - // drop_helper: Arc::clone(&state.abort_helper), - // reservation: reservation.clone(), - // }) as SendableRecordBatchStream - // }) - // .collect::>(); - // // Note that receiver size (`rx.len()`) and `num_input_partitions` are same. - - // // Get existing ordering to use for merging - // let sort_exprs = self.sort_exprs().unwrap_or(&[]); - - // // Merge streams (while preserving ordering) coming from - // // input partitions to this partition: - // let fetch = None; - // let merge_reservation = - // MemoryConsumer::new(format!("{}[Merge {partition}]", self.name())).register(context.memory_pool()); - // streaming_merge( - // input_streams, - // self.schema(), - // sort_exprs, - // BaselineMetrics::new(&self.metrics, partition), - // context.session_config().batch_size(), - // fetch, - // merge_reservation, - // ) - // } else { Ok(Box::pin(RepartitionStream { num_input_partitions, num_input_partitions_processed: 0, @@ -667,7 +621,6 @@ impl ExecutionPlan for RepartitionByRangeAndHashExec { drop_helper: Arc::clone(&state.abort_helper), reservation, })) - // } } } @@ -686,7 +639,7 @@ struct RepartitionStream { /// Handle to ensure background tasks are killed when no longer needed. #[allow(dead_code)] - drop_helper: Arc>, + drop_helper: Arc>>, /// Memory reservation. reservation: SharedMemoryReservation, @@ -745,7 +698,7 @@ struct PerPartitionStream { /// Handle to ensure background tasks are killed when no longer needed. #[allow(dead_code)] - drop_helper: Arc>, + drop_helper: Arc>>, /// Memory reservation. 
reservation: SharedMemoryReservation, diff --git a/rust/lakesoul-io/src/sorted_merge/sorted_stream_merger.rs b/rust/lakesoul-io/src/sorted_merge/sorted_stream_merger.rs index 52d3138cc..d11c2de0c 100644 --- a/rust/lakesoul-io/src/sorted_merge/sorted_stream_merger.rs +++ b/rust/lakesoul-io/src/sorted_merge/sorted_stream_merger.rs @@ -201,7 +201,7 @@ impl SortedStreamMerger { let rows = match self.row_converters[idx].convert_columns(&cols) { Ok(rows) => rows, Err(e) => { - return Poll::Ready(Err(ArrowError(e))); + return Poll::Ready(Err(ArrowError(e, None))); } }; @@ -260,7 +260,7 @@ impl SortedStreamMerger { loop { match self.range_combiner.poll_result() { RangeCombinerResult::Err(e) => { - return Poll::Ready(Some(Err(ArrowError(e)))); + return Poll::Ready(Some(Err(ArrowError(e, None)))); } RangeCombinerResult::None => { return Poll::Ready(None); @@ -285,7 +285,9 @@ impl SortedStreamMerger { } } } - RangeCombinerResult::RecordBatch(batch) => return Poll::Ready(Some(batch.map_err(ArrowError))), + RangeCombinerResult::RecordBatch(batch) => { + return Poll::Ready(Some(batch.map_err(|e| ArrowError(e, None)))) + } } } } diff --git a/rust/lakesoul-io/src/transform.rs b/rust/lakesoul-io/src/transform.rs index ae153ca67..0166f7f1a 100644 --- a/rust/lakesoul-io/src/transform.rs +++ b/rust/lakesoul-io/src/transform.rs @@ -123,7 +123,7 @@ pub fn transform_record_batch( transform_arrays, &RecordBatchOptions::new().with_row_count(Some(num_rows)), ) - .map_err(ArrowError) + .map_err(|e| ArrowError(e, None)) } pub fn transform_array( @@ -191,7 +191,7 @@ pub fn transform_array( } target_datatype => { if target_datatype != *array.data_type() { - cast_with_options(&array, &target_datatype, &ARROW_CAST_OPTIONS).map_err(ArrowError)? + cast_with_options(&array, &target_datatype, &ARROW_CAST_OPTIONS).map_err(|e| ArrowError(e, None))? } else { array.clone() } diff --git a/rust/proto/Cargo.toml b/rust/proto/Cargo.toml index eb32781b5..b243aa3e9 100644 --- a/rust/proto/Cargo.toml +++ b/rust/proto/Cargo.toml @@ -18,4 +18,4 @@ prost = { workspace = true } prost-build = { workspace = true } [target.'cfg(target_os = "linux")'.build-dependencies] -protobuf-src = "1.1.0" \ No newline at end of file +protobuf-src = "2.0.0" \ No newline at end of file
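A closing note on the parse_substrait_plan rewrite near the top of this patch: it blocks on async DataFusion code from a synchronous caller by preferring the ambient tokio runtime (Handle::try_current() plus block_in_place) and falling back to a throwaway current-thread runtime. A standalone sketch of that pattern, assuming only tokio; block_on_anywhere is a hypothetical helper name, and the block_in_place branch requires a multi-threaded runtime, as in the patch:

    use std::future::Future;

    use tokio::runtime::{Builder, Handle};
    use tokio::task;

    // Hypothetical helper; not part of the patch. Blocks on a future whether
    // or not the caller is already inside a tokio runtime.
    fn block_on_anywhere<F: Future>(fut: F) -> std::io::Result<F::Output> {
        match Handle::try_current() {
            // Inside a runtime: mark this worker thread as blocking, then
            // drive the future on the existing handle. block_in_place only
            // works on the multi-threaded runtime flavor.
            Ok(handle) => Ok(task::block_in_place(move || handle.block_on(fut))),
            // No ambient runtime: spin up a throwaway current-thread one.
            Err(_) => Ok(Builder::new_current_thread().build()?.block_on(fut)),
        }
    }

    fn main() -> std::io::Result<()> {
        assert_eq!(block_on_anywhere(async { 1 + 1 })?, 2);
        Ok(())
    }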