update python metadata interface && full arrow types test
Signed-off-by: zenghua <[email protected]>
zenghua committed Nov 2, 2023
1 parent e299795 commit 4dced35
Showing 5 changed files with 285 additions and 64 deletions.
24 changes: 19 additions & 5 deletions python/lakesoul/metadata/lib/lakesoul_metadata_c.py
@@ -13,7 +13,7 @@ class NonNull(Structure):


def reload_lib(path):
global lib, execute_query, create_tokio_runtime, free_tokio_runtime, create_tokio_postgres_client, free_tokio_postgres_client, create_prepared_statement, free_prepared_statement
global lib, execute_query, create_tokio_runtime, free_tokio_runtime, create_tokio_postgres_client, free_tokio_postgres_client, create_prepared_statement, free_prepared_statement, export_bytes_result, free_bytes_result
lib = CDLL(path)
# pub extern "C" fn execute_query(
# callback: extern "C" fn(i32, *const c_char),
@@ -22,13 +22,27 @@ def reload_lib(path):
# prepared: NonNull<Result<PreparedStatement>>,
# query_type: i32,
# joined_string: *const c_char,
# addr: c_ptrdiff_t,
# )
# ) -> NonNull<Result<BytesResult>>
execute_query = lib.execute_query
execute_query.restype = c_void_p
execute_query.restype = POINTER(NonNull)
execute_query.argtypes = [CFUNCTYPE(c_void_p, c_int, c_char_p), POINTER(NonNull), POINTER(NonNull),
POINTER(NonNull),
c_int, c_char_p, c_char_p]
c_int, c_char_p]

# pub extern "C" fn export_bytes_result(
# callback: extern "C" fn(bool, *const c_char),
# bytes: NonNull<Result<BytesResult>>,
# len: i32,
# addr: c_ptrdiff_t,
# )
export_bytes_result = lib.export_bytes_result
export_bytes_result.restype = c_void_p
export_bytes_result.argtypes = [CFUNCTYPE(c_void_p, c_bool, c_char_p), POINTER(NonNull), c_int, c_char_p]

# pub extern "C" fn free_bytes_result(bytes: NonNull<Result<BytesResult>>)
free_bytes_result = lib.free_bytes_result
free_bytes_result.restype = c_void_p
free_bytes_result.argtypes = [POINTER(NonNull)]

# pub extern "C" fn create_tokio_runtime() -> NonNull<Result<TokioRuntime>>
create_tokio_runtime = lib.create_tokio_runtime
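
For reference, a minimal sketch of how the reworked bindings chain together once reload_lib() has been called (runtime, client, prepared, query_type and joined_params are placeholders for handles and arguments obtained through the other bindings; the two-step copy mirrors native_client.py below):

from ctypes import CFUNCTYPE, c_void_p, c_int, c_bool, c_char_p, create_string_buffer

result_len = 0

def on_query(length, msg):
    # execute_query reports the byte length of the serialized result here.
    global result_len
    result_len = length

def on_export(ok, msg):
    # ok is False if copying the result into the Python buffer failed.
    pass

bytes_handle = execute_query(
    CFUNCTYPE(c_void_p, c_int, c_char_p)(on_query),
    runtime, client, prepared, query_type, joined_params)

buffer = create_string_buffer(result_len)
export_bytes_result(
    CFUNCTYPE(c_void_p, c_bool, c_char_p)(on_export),
    bytes_handle, result_len, buffer)

free_bytes_result(bytes_handle)  # release the Rust-side BytesResult
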
52 changes: 33 additions & 19 deletions python/lakesoul/metadata/native_client.py
@@ -14,10 +14,12 @@
global config
config = None


def reset_pg_conf(conf):
global config
config = " ".join(conf)


def get_pg_conf_from_env():
import os
conf = []
@@ -31,17 +33,18 @@ def get_pg_conf_from_env():
return conf
return None


class NativeMetadataClient:
def __init__(self):
self._lock = threading.Lock()
importlib.reload(lib)
self._buffer = create_string_buffer(4096)
self._large_buffer = create_string_buffer(65536)
self._runtime = lib.lakesoul_metadata_c.create_tokio_runtime()
self._free_tokio_runtime = lib.lakesoul_metadata_c.free_tokio_runtime
self._query_result_len = 0
self._bool = False

def callback(bool, msg):
#print("create connection callback: status={} msg={}".format(bool, msg.decode("utf-8")))
print("create connection callback: status={} msg={}".format(bool, msg.decode("utf-8")))
if not bool:
message = "fail to initialize lakesoul.metadata.native_client.NativeMetadataClient"
raise RuntimeError(message)
@@ -84,27 +87,38 @@ def __del__(self):

def execute_query(self, query_type, params):
joined_params = PARAM_DELIM.join(params).encode("utf-8")
buffer = self._buffer
if query_type >= DAO_TYPE_QUERY_LIST_OFFSET:
buffer = self._large_buffer
buffer.value = b''

def callback(len, msg):
#print("execute_query query_type={} callback: len={} msg={}".format(query_type, len, msg.decode("utf-8")))
pass
def execute_query_callback(len, msg):
print("execute_query query_type={} callback: len={} msg={}".format(query_type, len, msg.decode("utf-8")))
self._query_result_len = len

def export_bytes_result_callback(bool, msg):
print("export_bytes_result callback: bool={} msg={}".format(bool, msg.decode("utf-8")))
self._bool = bool

with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
future = executor.submit(lib.lakesoul_metadata_c.execute_query,
CFUNCTYPE(c_void_p, c_int, c_char_p)(callback), self._runtime, self._client,
self._prepared, query_type, joined_params, buffer)
CFUNCTYPE(c_void_p, c_int, c_char_p)(execute_query_callback), self._runtime,
self._client,
self._prepared, query_type, joined_params)
bytes = future.result(2.0)

buffer = create_string_buffer(self._query_result_len)
future = executor.submit(lib.lakesoul_metadata_c.export_bytes_result,
CFUNCTYPE(c_void_p, c_bool, c_char_p)(export_bytes_result_callback), bytes,
self._query_result_len, buffer)
future.result(2.0)

if len(buffer.value) == 0:
return None
else:
wrapper = entity_pb2.JniWrapper()
wrapper.ParseFromString(buffer.value)
return wrapper
ret = None
if len(buffer.value) > 0:
wrapper = entity_pb2.JniWrapper()
wrapper.ParseFromString(buffer.value)
ret = wrapper

lib.lakesoul_metadata_c.free_bytes_result(bytes)

return ret

def get_lock(self):
return self._lock
@@ -120,7 +134,7 @@ def get_instance():
if INSTANCE is None:
import os
dir_path = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(dir_path, 'lib', 'liblakesoul_metadata_c.so')
file_path = os.path.join(dir_path, 'lib', 'liblakesoul_metadata_c.dylib')
lib.reload_lib(file_path)
INSTANCE = NativeMetadataClient()
return INSTANCE
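
A hedged usage sketch at the client level, using only the helpers shown above (reset_pg_conf, get_instance); the connection settings and parameter list are illustrative, and query_type must be one of the DAO query-type constants defined in the metadata package:

from lakesoul.metadata.native_client import reset_pg_conf, get_instance

reset_pg_conf(["host=localhost", "port=5432", "dbname=lakesoul_test",
               "user=lakesoul_test", "password=lakesoul_test"])

client = get_instance()
# execute_query returns a parsed entity_pb2.JniWrapper, or None if the query produced no bytes.
wrapper = client.execute_query(query_type, ["table_name", "table_namespace"])
if wrapper is not None:
    print(wrapper)
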
112 changes: 85 additions & 27 deletions python/lakesoul/metadata/utils.py
@@ -6,37 +6,95 @@
import pyarrow


def to_arrow_field(spark_field_json):
spark_type = spark_field_json['type']
arrow_type = None
if spark_type == 'long':
arrow_type = pyarrow.int64()
elif spark_type == 'integer':
arrow_type = pyarrow.int32()
elif spark_type == 'string':
arrow_type = pyarrow.utf8()
elif spark_type == 'float':
arrow_type = pyarrow.float32()
elif spark_type == 'double':
arrow_type = pyarrow.float64()
elif spark_type == "binary":
arrow_type = pyarrow.binary()
elif spark_type.startswith("decimal"):
arrow_type = pyarrow.decimal128(38)
elif spark_type == 'struct':
fields = spark_field_json['fields']
arrow_fields = []
for field in fields:
arrow_fields.append(to_arrow_field(field))
arrow_type = pyarrow.struct(arrow_fields)
def to_arrow_type(arrow_type_json):
if isinstance(arrow_type_json, str):
if arrow_type_json == 'Boolean':
return pyarrow.bool_()
elif arrow_type_json == 'Date32':
return pyarrow.date32()
elif arrow_type_json == 'Date64':
return pyarrow.date64()
elif arrow_type_json == 'Int8':
return pyarrow.int8()
elif arrow_type_json == 'Int16':
return pyarrow.int16()
elif arrow_type_json == 'Int32':
return pyarrow.int32()
elif arrow_type_json == 'Int64':
return pyarrow.int64()
elif arrow_type_json == 'UInt8':
return pyarrow.uint8()
elif arrow_type_json == 'UInt16':
return pyarrow.uint16()
elif arrow_type_json == 'UInt32':
return pyarrow.uint32()
elif arrow_type_json == 'UInt64':
return pyarrow.uint64()
elif arrow_type_json == 'String':
return pyarrow.string()
elif arrow_type_json == 'Utf8':
return pyarrow.utf8()
elif arrow_type_json == 'LargeUtf8':
return pyarrow.large_utf8()
elif arrow_type_json == 'Float32':
return pyarrow.float32()
elif arrow_type_json == 'Float64':
return pyarrow.float64()
elif arrow_type_json == "Binary":
return pyarrow.binary()
elif arrow_type_json == "LargeBinary":
return pyarrow.large_binary()
elif arrow_type_json == "Null":
return pyarrow.null()
elif isinstance(arrow_type_json, dict):
if 'Decimal128' in arrow_type_json:
return pyarrow.decimal128(arrow_type_json['Decimal128'][0], arrow_type_json['Decimal128'][1])
elif 'Decimal256' in arrow_type_json:
return pyarrow.decimal256(arrow_type_json['Decimal256'][0], arrow_type_json['Decimal256'][1])
elif 'Interval' in arrow_type_json:
if arrow_type_json['Interval'] == 'DayTime':
return pyarrow.month_day_nano_interval()
elif arrow_type_json['Interval'] == 'YearMonth':
return pyarrow.month_day_nano_interval()
elif 'List' in arrow_type_json:
return pyarrow.list_(to_arrow_type(arrow_type_json['List']['data_type']))
elif 'FixedSizeList' in arrow_type_json:
return pyarrow.list_(to_arrow_type(arrow_type_json['FixedSizeList'][0]['data_type']),
arrow_type_json['FixedSizeList'][1])
elif 'Dictionary' in arrow_type_json:
return pyarrow.dictionary(to_arrow_type(arrow_type_json['Dictionary'][0]),
to_arrow_type(arrow_type_json['Dictionary'][1]))
elif 'FixedSizeBinary' in arrow_type_json:
return pyarrow.binary(arrow_type_json['FixedSizeBinary'])
elif 'Map' in arrow_type_json:
return pyarrow.map_(to_arrow_type(arrow_type_json['Map'][0]['data_type']['Struct'][0]['data_type']),
to_arrow_type(arrow_type_json['Map'][0]['data_type']['Struct'][1]['data_type']),
arrow_type_json['Map'][1])
elif 'Struct' in arrow_type_json:
arrow_fields = []
for field in arrow_type_json['Struct']:
arrow_fields.append(to_arrow_field(field))
return pyarrow.struct(arrow_fields)
elif 'Time32' in arrow_type_json:
return pyarrow.time32('ms' if arrow_type_json['Time32'] == 'Millisecond' else 's')
elif 'Time64' in arrow_type_json:
return pyarrow.time64('us' if arrow_type_json['Time64'] == 'Microsecond' else 'ns')
elif 'Timestamp' in arrow_type_json:
unit = arrow_type_json['Timestamp'][0]
unit = 's' if unit == 'Second' else 'ms' if unit == 'Millisecond' else 'us' if unit == 'Microsecond' else 'ns'
return pyarrow.timestamp(unit, arrow_type_json['Timestamp'][1])
else:
raise IOError("Not supported spark type " + str(spark_type))
return pyarrow.field(spark_field_json['name'], arrow_type, spark_field_json['nullable'])
raise IOError("Unsupported arrow type " + str(arrow_type_json))


def to_arrow_schema(spark_schema_str, exclude_columns=None):
def to_arrow_field(arrow_field_json):
return pyarrow.field(arrow_field_json['name'], to_arrow_type(arrow_field_json['data_type']),
arrow_field_json['nullable'])


def to_arrow_schema(schema_json_str, exclude_columns=None):
exclude_columns = frozenset(exclude_columns or frozenset())
fields = json.loads(spark_schema_str)['fields']
_json = json.loads(schema_json_str)
fields = _json['fields']
arrow_fields = []
for field in fields:
if field['name'] in exclude_columns:
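
To illustrate, a small example of the schema JSON these helpers consume; the shape is inferred from the branches above (the arrow-rs serialization of DataType), and field keys may differ across versions:

import json

schema_json = json.dumps({
    "fields": [
        {"name": "id", "data_type": "Int64", "nullable": False},
        {"name": "score", "data_type": {"Decimal128": [10, 2]}, "nullable": True},
        {"name": "ts", "data_type": {"Timestamp": ["Microsecond", None]}, "nullable": True},
        {"name": "tags", "data_type": {"List": {"name": "item", "data_type": "Utf8",
                                                "nullable": True}}, "nullable": True},
    ]
})

schema = to_arrow_schema(schema_json, exclude_columns=["score"])
# -> id: int64 not null, ts: timestamp[us], tags: list<item: string>
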
21 changes: 11 additions & 10 deletions python/metadata_poc.py
@@ -8,18 +8,19 @@

if __name__ == '__main__':
reset_pg_conf(
["host=localhost", "port=5432", " dbname=lakesoul_test", " user=lakesoul_test", "password=lakesoul_test"])
["host=localhost", "port=5433", " dbname=test_lakesoul_meta", " user=yugabyte", "password=yugabyte"])

db_manager = DBManager()
data_files = db_manager.get_data_files_by_table_name("titanic")
table_name = "test_datatypes"
data_files = db_manager.get_data_files_by_table_name(table_name)
print(data_files)
data_files = db_manager.get_data_files_by_table_name("titanic", partitions={"split": "train"})
data_files = db_manager.get_data_files_by_table_name(table_name)
print(data_files)
arrow_schema = db_manager.get_arrow_schema_by_table_name("titanic")
print(arrow_schema)
data_files = db_manager.get_data_files_by_table_name("imdb")
print(data_files)
data_files = db_manager.get_data_files_by_table_name("imdb", partitions={"split": "train"})
print(data_files)
arrow_schema = db_manager.get_arrow_schema_by_table_name("imdb")
arrow_schema = db_manager.get_arrow_schema_by_table_name(table_name)
print(arrow_schema)
# data_files = db_manager.get_data_files_by_table_name("imdb")
# print(data_files)
# data_files = db_manager.get_data_files_by_table_name("imdb", partitions={"split": "train"})
# print(data_files)
# arrow_schema = db_manager.get_arrow_schema_by_table_name("imdb")
# print(arrow_schema)