[Rust][Metadata] Create Rust MetadataClient & add CI test cases #357

Merged 4 commits on Nov 1, 2023
1 change: 1 addition & 0 deletions .github/workflows/native-build.yml
@@ -78,6 +78,7 @@ jobs:
- uses: Swatinem/rust-cache@v2
with:
workspaces: "./rust -> target"
env-vars: "JAVA_HOME"
- uses: actions-rs/cargo@v1
with:
command: build
26 changes: 26 additions & 0 deletions .github/workflows/rust-ci.yml
@@ -24,6 +24,27 @@ env:
jobs:
rust_ci:
runs-on: ubuntu-latest
services:
# Label used to access the service container
postgres:
# Docker Hub image
image: postgres:14.5
# Provide the password for postgres
env:
POSTGRES_PASSWORD: lakesoul_test
POSTGRES_USER: lakesoul_test
POSTGRES_DB: lakesoul_test
# Set health checks to wait until postgres has started
options: >-
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries 5
--name lakesoul-test-pg
ports:
# Maps tcp port 5432 on service container to the host
- 5432:5432

steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
@@ -32,6 +53,11 @@ jobs:
toolchain: nightly-2023-05-20
components: clippy
default: true
- name: Install psql
run: sudo apt-get install -y postgresql-client-14
- name: Init PG
run: |
./script/meta_init_for_local_test.sh -j 2
- name: Install Protoc
uses: arduino/setup-protoc@v2
with:
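For context, the Postgres service defined above is what the new metadata tests talk to; `MetaDataClient::from_env()` in the test module below presumably picks its connection settings up from the environment. A minimal sketch of such an env-driven connection string, assuming illustrative `LAKESOUL_PG_*` variable names (not necessarily the crate's actual keys) and defaulting to the credentials the CI service is started with:

```rust
use std::env;

// Assemble a libpq-style connection string from environment variables,
// falling back to the CI Postgres service's defaults configured above.
// The LAKESOUL_PG_* names are illustrative assumptions.
fn pg_conn_string_from_env() -> String {
    let get = |key: &str, default: &str| env::var(key).unwrap_or_else(|_| default.to_string());
    let host = get("LAKESOUL_PG_HOST", "127.0.0.1");
    let port = get("LAKESOUL_PG_PORT", "5432");
    let user = get("LAKESOUL_PG_USER", "lakesoul_test");
    let password = get("LAKESOUL_PG_PASSWORD", "lakesoul_test");
    let db = get("LAKESOUL_PG_DB", "lakesoul_test");
    format!("host={host} port={port} dbname={db} user={user} password={password}")
}

fn main() {
    println!("{}", pg_conn_string_from_env());
}
```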
4 changes: 4 additions & 0 deletions rust/Cargo.lock

Some generated files are not rendered by default.

3 changes: 3 additions & 0 deletions rust/lakesoul-datafusion/Cargo.toml
@@ -14,3 +14,6 @@ lakesoul-io = { path = "../lakesoul-io" }
lakesoul-metadata = { path = "../lakesoul-metadata" }
proto = { path = "../proto" }
prost = "0.11"
async-trait = "0.1"
futures = "0.3"
uuid = { version = "1.4.0", features = ["v4", "fast-rng", "macro-diagnostics"]}
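The new `uuid` dependency (with the `v4` feature) is what the tests below use for table IDs and commit IDs; in particular, `Uuid::as_u64_pair()` splits the 128-bit value into the `high`/`low` words that the protobuf `Uuid` message carries. A standalone illustration of that round trip:

```rust
use uuid::Uuid;

fn main() {
    // A random v4 UUID, split into two u64 halves (most significant first),
    // mirroring the protobuf Uuid { high, low } layout used in commit_data.
    let id = Uuid::new_v4();
    let (high, low) = id.as_u64_pair();
    println!("uuid={id} high={high:#018x} low={low:#018x}");
    // The halves reassemble into the original UUID.
    assert_eq!(Uuid::from_u64_pair(high, low), id);
}
```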
202 changes: 201 additions & 1 deletion rust/lakesoul-datafusion/src/test/upsert_tests.rs
@@ -519,4 +519,204 @@ mod upsert_with_io_config_tests {
}


}
}

mod upsert_with_metadata_tests {
use std::sync::Arc;
use std::env;
use std::path::PathBuf;
use std::time::SystemTime;


use lakesoul_io::datasource::parquet_source::EmptySchemaProvider;
use lakesoul_io::serde_json::json;
use lakesoul_io::{arrow, datafusion, tokio, serde_json};

use arrow::util::pretty::print_batches;
use arrow::datatypes::{Schema, SchemaRef, Field};
use arrow::record_batch::RecordBatch;
use arrow::array::{ArrayRef, Int32Array};

use datafusion::assert_batches_eq;
use datafusion::prelude::{DataFrame, SessionContext};
use datafusion::logical_expr::LogicalPlanBuilder;
use datafusion::error::Result;

use proto::proto::entity::{TableInfo, DataCommitInfo, FileOp, DataFileOp, CommitOp, Uuid};
use tokio::runtime::{Builder, Runtime};

use lakesoul_io::lakesoul_io_config::{LakeSoulIOConfigBuilder, LakeSoulIOConfig};
use lakesoul_io::lakesoul_reader::{LakeSoulReader, SyncSendableMutableLakeSoulReader};
use lakesoul_io::lakesoul_writer::SyncSendableMutableLakeSoulWriter;

use lakesoul_metadata::{Client, PreparedStatementMap, MetaDataClient};


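// Record the files written under `config` as an AppendCommit on `table_name`
// in the default namespace, stamping the commit with a fresh v4 UUID.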
fn commit_data(client: &mut MetaDataClient, table_name: &str, config: LakeSoulIOConfig) -> Result<()> {
let table_name_id = client.get_table_name_id_by_table_name(table_name, "default")?;
match client.commit_data_commit_info(DataCommitInfo {
table_id: table_name_id.table_id,
partition_desc: "-5".to_string(),
file_ops: config.files_slice()
.iter()
.map(|file| DataFileOp {
file_op: FileOp::Add as i32,
path: file.clone(),
..Default::default()
})
.collect(),
commit_op: CommitOp::AppendCommit as i32,
timestamp: SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() as i64,
commit_id: {
let (high, low) = uuid::Uuid::new_v4().as_u64_pair();
Some(Uuid{high, low})
},
..Default::default()
})
{
Ok(()) => Ok(()),
Err(e) => Err(lakesoul_io::lakesoul_reader::DataFusionError::IoError(e))
}
}

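// Register `table_name` in the default namespace: the table path is placed
// under the system temp directory and the schema is stored as JSON.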
fn create_table(client: &mut MetaDataClient, table_name: &str, config: LakeSoulIOConfig) -> Result<()> {
match client.create_table(
TableInfo {
table_id: format!("table_{}", uuid::Uuid::new_v4()),
table_name: table_name.to_string(),
table_path: [env::temp_dir().to_str().unwrap(), table_name].iter().collect::<PathBuf>().to_str().unwrap().to_string(),
table_schema: serde_json::to_string(&config.schema()).unwrap(),
table_namespace: "default".to_string(),
properties: "{}".to_string(),
partitions: ";".to_owned() + config.primary_keys_slice().iter().map(String::as_str).collect::<Vec<_>>().join(",").as_str(),
domain: "public".to_string(),
})
{
Ok(()) => Ok(()),
Err(e) => Err(lakesoul_io::lakesoul_reader::DataFusionError::IoError(e))
}
}

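// Rebuild an IO config from the table's registered metadata: its data files,
// its JSON schema, and the hash keys parsed out of the partitions string.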
fn create_io_config_builder(client: &mut MetaDataClient, table_name: &str) -> LakeSoulIOConfigBuilder {
let table_info = client.get_table_info_by_table_name(table_name, "default").unwrap();
let data_files = client.get_data_files_by_table_name(table_name, vec![], "default").unwrap();
let schema_str = client.get_schema_by_table_name(table_name, "default").unwrap();
let schema = serde_json::from_str::<Schema>(schema_str.as_str()).unwrap();

LakeSoulIOConfigBuilder::new()
.with_files(data_files)
.with_schema(Arc::new(schema))
.with_primary_keys(
parse_table_info_partitions(table_info.partitions).1
)
}

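// Split the partitions string ("range_keys;hash_keys", comma-separated on
// each side) into (range_keys, hash_keys).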
fn parse_table_info_partitions(partitions: String) -> (Vec<String>, Vec<String>) {
let (range_keys, hash_keys) = partitions.split_at(partitions.find(';').unwrap());
let hash_keys = &hash_keys[1..];
(
range_keys.split(',').map(str::to_string).collect(),
hash_keys.split(',').map(str::to_string).collect(),
)
}

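// Build a RecordBatch of nullable Int32 columns from parallel name/value slices.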
fn create_batch_i32(names: Vec<&str>, values: Vec<&[i32]>) -> RecordBatch {
let values = values
.into_iter()
.map(|vec| Arc::new(Int32Array::from(Vec::from(vec))) as ArrayRef)
.collect::<Vec<ArrayRef>>();
let iter = names.into_iter().zip(values).map(|(name, array)| (name, array, true)).collect::<Vec<_>>();
RecordBatch::try_from_iter_with_nullable(iter).unwrap()
}


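// Write `batch` to a fresh, timestamp-named parquet file under the table's
// directory, then commit that file to the metadata store.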
fn execute_upsert(batch: RecordBatch, table_name: &str, client: &mut MetaDataClient) -> Result<()> {
let file = [
env::temp_dir().to_str().unwrap(),
table_name,
format!("{}.parquet", SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_millis()).as_str(),
].iter().collect::<PathBuf>().to_str().unwrap().to_string();
let builder = create_io_config_builder(client, table_name).with_file(file.clone()).with_schema(batch.schema());
let config = builder.clone().build();

let writer = SyncSendableMutableLakeSoulWriter::try_new(config, Builder::new_current_thread().build().unwrap()).unwrap();
writer.write_batch(batch)?;
writer.flush_and_close()?;
commit_data(client, table_name, builder.clone().build())
}




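// Register the table in metadata, then write its initial batch.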
fn init_table(batch: RecordBatch, table_name: &str, pks:Vec<String>, client: &mut MetaDataClient) -> Result<()> {
let schema = batch.schema();
let builder = LakeSoulIOConfigBuilder::new()
.with_schema(schema.clone())
.with_primary_keys(pks);
create_table(client, table_name, builder.build())?;
execute_upsert(batch, table_name, client)
}



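// Upsert `batch`, read the table back via a metadata-driven config (projected
// to `selected_cols`, optionally filtered), and compare against `expected`.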
fn check_upsert(batch: RecordBatch, table_name: &str, selected_cols: Vec<&str>, filters: Option<String>, client: &mut MetaDataClient, expected: &[&str]) -> Result<()> {
execute_upsert(batch, table_name, client)?;
let builder = create_io_config_builder(client, table_name);
let builder = builder
.with_schema(SchemaRef::new(Schema::new(
selected_cols.iter().map(|col| Field::new(*col, arrow::datatypes::DataType::Int32, true)).collect::<Vec<_>>()
)));
let builder = if let Some(filters) = filters {
builder.with_filter_str(filters)
} else {
builder
};
let mut reader = SyncSendableMutableLakeSoulReader::new(LakeSoulReader::new(builder.build()).unwrap(), Builder::new_current_thread().build().unwrap());
reader.start_blocked()?;
let result = reader.next_rb_blocked();
match result {
Some(result) => {
assert_batches_eq!(expected, &[result?]);
Ok(())
},
None => Ok(())
}
}

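// End-to-end check: initialize four rows, upsert three overlapping keys, and
// verify the merged result on read.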
#[test]
fn test_merge_same_column_i32() -> Result<()> {
let table_name = "merge-same_column";
let mut client = MetaDataClient::from_env();
// let mut client = MetaDataClient::from_config("host=127.0.0.1 port=5433 dbname=test_lakesoul_meta user=yugabyte password=yugabyte".to_string());
client.meta_cleanup()?;
init_table(
create_batch_i32(vec!["range", "hash", "value"], vec![&[20201101, 20201101, 20201101, 20201102], &[1, 2, 3, 4], &[1, 2, 3, 4]]),
table_name,
vec!["range".to_string(), "hash".to_string()],
&mut client,
)?;

check_upsert(
create_batch_i32(vec!["range", "hash", "value"], vec![&[20201101, 20201101, 20201101], &[1, 3, 4], &[11, 33, 44]]),
table_name,
vec!["range", "hash", "value"],
None,
&mut client,
&[
"+----------+------+-------+",
"| range | hash | value |",
"+----------+------+-------+",
"| 20201101 | 1 | 11 |",
"| 20201101 | 2 | 2 |",
"| 20201101 | 3 | 33 |",
"| 20201101 | 4 | 44 |",
"| 20201102 | 4 | 4 |",
"+----------+------+-------+",
]
)
}
}
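A detail worth noting in the module above: `create_table` encodes partitions as `"range_keys;hash_keys"` (comma-separated on each side, with the range side left empty here), and `parse_table_info_partitions` splits that string back apart. A dependency-free sketch of the same round trip; the helper names are ours, not the crate's:

```rust
// Encode range/hash keys into the "range1,range2;hash1,hash2" layout used above.
fn encode_partitions(range_keys: &[&str], hash_keys: &[&str]) -> String {
    format!("{};{}", range_keys.join(","), hash_keys.join(","))
}

// Decode it back; empty segments (e.g. no range keys) yield empty vectors.
fn decode_partitions(partitions: &str) -> (Vec<String>, Vec<String>) {
    let (range, hash) = partitions.split_once(';').expect("missing ';' separator");
    let split = |s: &str| -> Vec<String> {
        s.split(',').filter(|k| !k.is_empty()).map(str::to_string).collect()
    };
    (split(range), split(hash))
}

fn main() {
    // No range keys, two hash keys, as in test_merge_same_column_i32.
    let encoded = encode_partitions(&[], &["range", "hash"]);
    assert_eq!(encoded, ";range,hash");
    let (range_keys, hash_keys) = decode_partitions(&encoded);
    assert!(range_keys.is_empty());
    assert_eq!(hash_keys, vec!["range", "hash"]);
}
```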
1 change: 1 addition & 0 deletions rust/lakesoul-io/src/datasource/mod.rs
@@ -3,3 +3,4 @@
// SPDX-License-Identifier: Apache-2.0

pub mod parquet_source;
pub mod parquet_sink;