From 9a72f75b0f19c88fb32456040ce20bd313a84041 Mon Sep 17 00:00:00 2001 From: Mingyu Chen Date: Tue, 2 Jul 2024 23:55:04 +0800 Subject: [PATCH] [fix](hive) support find serde info from both tbl properties and serde properties (#37043) Some Hive tables set serde properties in tblproperties. We need to support that. --- .../scripts/data/regression/serde_prop/run.sh | 9 +++ .../serde_prop/some_serde_table.hql | 34 +++++++++++ .../hive/HiveMetaStoreClientHelper.java | 26 +++++++++ .../datasource/hive/source/HiveScanNode.java | 58 ++++++++++--------- .../hive/test_hive_serde_prop.out | 16 +++++ .../hive/test_hive_serde_prop.groovy | 4 ++ 6 files changed, 121 insertions(+), 26 deletions(-) create mode 100755 docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh create mode 100644 docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh new file mode 100755 index 00000000000000..ef6538563d5b58 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -x + +CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" + +# create table +hive -f "${CUR_DIR}"/some_serde_table.hql + + diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql new file mode 100644 index 00000000000000..fa6ad791118c1e --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/serde_prop/some_serde_table.hql @@ -0,0 +1,34 @@ +create database if not exists regression; +use regression; + +CREATE TABLE `serde_test1`( + `id` int, + `name` string) +ROW FORMAT SERDE + 
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +WITH SERDEPROPERTIES ( + 'field.delim'='', + 'serialization.format'='') +STORED AS INPUTFORMAT + 'org.apache.hadoop.mapred.TextInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'; + +CREATE TABLE `serde_test2`( + `id` int, + `name` string) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +WITH SERDEPROPERTIES ( + 'field.delim'='', + 'serialization.format'='') +STORED AS INPUTFORMAT + 'org.apache.hadoop.mapred.TextInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' +TBLPROPERTIES ( + 'field.delim'='|' +); + +insert into serde_test1 values(1, "abc"),(2, "def"); +insert into serde_test2 values(1, "abc"),(2, "def"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java index ea821cefec6f23..22bf13755a2e11 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java @@ -51,6 +51,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.StorageDescriptor; +import org.apache.hadoop.hive.metastore.api.Table; import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; @@ -79,6 +80,7 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -847,4 +849,28 @@ public static Configuration getConfiguration(HMSExternalTable table) { } return conf; } + + public static Optional getSerdeProperty(Table table, 
String key) { + String valueFromSd = table.getSd().getSerdeInfo().getParameters().get(key); + String valueFromTbl = table.getParameters().get(key); + return firstNonNullable(valueFromTbl, valueFromSd); + } + + private static Optional firstNonNullable(String... values) { + for (String value : values) { + if (!Strings.isNullOrEmpty(value)) { + return Optional.of(value); + } + } + return Optional.empty(); + } + + public static String firstPresentOrDefault(String defaultValue, Optional... values) { + for (Optional value : values) { + if (value.isPresent()) { + return value.get(); + } + } + return defaultValue; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java index 1970a48f2d465a..0214ecc464238a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java @@ -85,7 +85,7 @@ public class HiveScanNode extends FileQueryScanNode { public static final String PROP_LINE_DELIMITER = "line.delim"; public static final String DEFAULT_LINE_DELIMITER = "\n"; public static final String PROP_SEPARATOR_CHAR = "separatorChar"; - public static final String PROP_QUOTA_CHAR = "quoteChar"; + public static final String PROP_QUOTE_CHAR = "quoteChar"; public static final String PROP_COLLECTION_DELIMITER_HIVE2 = "colelction.delim"; public static final String PROP_COLLECTION_DELIMITER_HIVE3 = "collection.delim"; @@ -445,32 +445,37 @@ protected Map getLocationProperties() throws UserException { @Override protected TFileAttributes getFileAttributes() throws UserException { TFileTextScanRangeParams textParams = new TFileTextScanRangeParams(); - java.util.Map delimiter = hmsTable.getRemoteTable().getSd().getSerdeInfo().getParameters(); - if (delimiter.containsKey(PROP_FIELD_DELIMITER)) { - if (delimiter.get(PROP_FIELD_DELIMITER).length() == 0) 
{ - textParams.setColumnSeparator(DEFAULT_FIELD_DELIMITER); - } else { - textParams.setColumnSeparator(delimiter.get(PROP_FIELD_DELIMITER)); - } - } else if (delimiter.containsKey(PROP_SEPARATOR_CHAR)) { - textParams.setColumnSeparator(delimiter.get(PROP_SEPARATOR_CHAR)); - } else { - textParams.setColumnSeparator(DEFAULT_FIELD_DELIMITER); - } - if (delimiter.containsKey(PROP_QUOTA_CHAR)) { - textParams.setEnclose(delimiter.get(PROP_QUOTA_CHAR).getBytes()[0]); - } - textParams.setLineDelimiter(delimiter.getOrDefault(PROP_LINE_DELIMITER, DEFAULT_LINE_DELIMITER)); - textParams.setMapkvDelimiter(delimiter.getOrDefault(PROP_MAP_KV_DELIMITER, DEFAULT_MAP_KV_DELIMITER)); - - // textParams.collection_delimiter field is map, array and struct delimiter; - if (delimiter.get(PROP_COLLECTION_DELIMITER_HIVE2) != null) { - textParams.setCollectionDelimiter(delimiter.get(PROP_COLLECTION_DELIMITER_HIVE2)); - } else if (delimiter.get(PROP_COLLECTION_DELIMITER_HIVE3) != null) { - textParams.setCollectionDelimiter(delimiter.get(PROP_COLLECTION_DELIMITER_HIVE3)); - } else { - textParams.setCollectionDelimiter(DEFAULT_COLLECTION_DELIMITER); + + // 1. set column separator + Optional fieldDelim = + HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_FIELD_DELIMITER); + Optional columnSeparator = + HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_SEPARATOR_CHAR); + textParams.setColumnSeparator(HiveMetaStoreClientHelper.firstPresentOrDefault( + DEFAULT_FIELD_DELIMITER, fieldDelim, columnSeparator)); + // 2. set line delimiter + Optional lineDelim = + HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_LINE_DELIMITER); + textParams.setLineDelimiter(HiveMetaStoreClientHelper.firstPresentOrDefault( + DEFAULT_LINE_DELIMITER, lineDelim)); + // 3. 
set mapkv delimiter + Optional mapkvDelim = + HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_MAP_KV_DELIMITER); + textParams.setMapkvDelimiter(HiveMetaStoreClientHelper.firstPresentOrDefault( + DEFAULT_MAP_KV_DELIMITER, mapkvDelim)); + // 4. set collection delimiter + Optional collectionDelimHive2 = + HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_COLLECTION_DELIMITER_HIVE2); + Optional collectionDelimHive3 = + HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_COLLECTION_DELIMITER_HIVE3); + textParams.setCollectionDelimiter(HiveMetaStoreClientHelper.firstPresentOrDefault( + DEFAULT_COLLECTION_DELIMITER, collectionDelimHive2, collectionDelimHive3)); + // 5. set quote char + Map serdeParams = hmsTable.getRemoteTable().getSd().getSerdeInfo().getParameters(); + if (serdeParams.containsKey(PROP_QUOTE_CHAR)) { + textParams.setEnclose(serdeParams.get(PROP_QUOTE_CHAR).getBytes()[0]); } + TFileAttributes fileAttributes = new TFileAttributes(); fileAttributes.setTextParams(textParams); fileAttributes.setHeaderType(""); @@ -502,3 +507,4 @@ protected TFileCompressType getFileCompressType(FileSplit fileSplit) throws User return compressType; } } + diff --git a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out index 818db069d5045e..b00eebec49d711 100644 --- a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out +++ b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out @@ -3,7 +3,23 @@ a 1.1 b 2.2 +-- !2 -- +1 abc +2 def + +-- !2 -- +1 abc +2 def + -- !1 -- a 1.1 b 2.2 +-- !2 -- +1 abc +2 def + +-- !2 -- +1 abc +2 def + diff --git a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy index 7ac366748b6ff2..3ae6b21bbba4f6 100644 --- 
a/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_serde_prop.groovy @@ -37,6 +37,10 @@ suite("test_hive_serde_prop", "external_docker,hive,external_docker_hive,p0,exte );""" qt_1 """select * from ${catalog_name}.${ex_db_name}.employee_gz order by name;""" + + + qt_2 """select * from ${catalog_name}.regression.serde_test1 order by id;""" + qt_2 """select * from ${catalog_name}.regression.serde_test2 order by id;""" } }