Skip to content

Commit

Permalink
[fix](hive) support find serde info from both tbl properties and serd…
Browse files Browse the repository at this point in the history
…e properties (apache#37043)

Some Hive tables store their serde properties in TBLPROPERTIES instead of SERDEPROPERTIES.
We need to support reading serde properties from both places.
  • Loading branch information
morningman authored Jul 2, 2024
1 parent b3b035e commit 9a72f75
Show file tree
Hide file tree
Showing 6 changed files with 121 additions and 26 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Create the Hive tables used by the serde-property regression test.
# Fail fast (-e), treat unset vars as errors (-u), and propagate failures
# through pipes (pipefail) so a failed `hive -f` aborts the setup instead of
# leaving a half-initialized environment; keep -x for trace output.
set -euxo pipefail

CUR_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"

# create table
hive -f "${CUR_DIR}"/some_serde_table.hql


Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
-- Fixture for the serde-property test: Doris must be able to read serde
-- properties from both SERDEPROPERTIES and TBLPROPERTIES.
create database if not exists regression;
use regression;

-- serde_test1: delimiters declared only in SERDEPROPERTIES, both empty —
-- the reader must fall back to its default field delimiter.
CREATE TABLE `serde_test1`(
`id` int,
`name` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='',
'serialization.format'='')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';

-- serde_test2: same layout, but the effective field delimiter ('|') is
-- stored in TBLPROPERTIES while SERDEPROPERTIES carries only empty values —
-- exercises the table-properties lookup path.
CREATE TABLE `serde_test2`(
`id` int,
`name` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='',
'serialization.format'='')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
TBLPROPERTIES (
'field.delim'='|'
);

-- Two identical rows per table so the regression output is easy to compare.
insert into serde_test1 values(1, "abc"),(2, "def");
insert into serde_test2 values(1, "abc"),(2, "def");
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
Expand Down Expand Up @@ -79,6 +80,7 @@
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -847,4 +849,28 @@ public static Configuration getConfiguration(HMSExternalTable table) {
}
return conf;
}

/**
 * Looks up a serde property of the given Hive table. The table-level
 * parameters (TBLPROPERTIES) take precedence; the storage descriptor's
 * serde-info parameters are the fallback. Empty strings count as absent.
 *
 * @param table the Hive metastore table to inspect
 * @param key   the serde property name, e.g. "field.delim"
 * @return the first non-empty value found, or an empty Optional
 */
public static Optional<String> getSerdeProperty(Table table, String key) {
    String fromTblProperties = table.getParameters().get(key);
    String fromSerdeInfo = table.getSd().getSerdeInfo().getParameters().get(key);
    return firstNonNullable(fromTblProperties, fromSerdeInfo);
}

/**
 * Returns the first argument that is neither null nor the empty string,
 * wrapped in an Optional; empty if no such argument exists.
 */
private static Optional<String> firstNonNullable(String... values) {
    for (String candidate : values) {
        // Equivalent to Guava's Strings.isNullOrEmpty check, inverted.
        if (candidate == null || candidate.isEmpty()) {
            continue;
        }
        return Optional.of(candidate);
    }
    return Optional.empty();
}

/**
 * Returns the value of the first present Optional, or {@code defaultValue}
 * when every candidate is empty.
 *
 * @param defaultValue value to fall back to when no candidate is present
 * @param values       candidates, checked in argument order
 * @return the first present value, or {@code defaultValue}
 */
@SafeVarargs // static method; never stores into the generic varargs array
public static String firstPresentOrDefault(String defaultValue, Optional<String>... values) {
    for (Optional<String> value : values) {
        if (value.isPresent()) {
            return value.get();
        }
    }
    return defaultValue;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ public class HiveScanNode extends FileQueryScanNode {
public static final String PROP_LINE_DELIMITER = "line.delim";
public static final String DEFAULT_LINE_DELIMITER = "\n";
public static final String PROP_SEPARATOR_CHAR = "separatorChar";
public static final String PROP_QUOTA_CHAR = "quoteChar";
public static final String PROP_QUOTE_CHAR = "quoteChar";

public static final String PROP_COLLECTION_DELIMITER_HIVE2 = "colelction.delim";
public static final String PROP_COLLECTION_DELIMITER_HIVE3 = "collection.delim";
Expand Down Expand Up @@ -445,32 +445,37 @@ protected Map<String, String> getLocationProperties() throws UserException {
@Override
// Builds the text-format scan attributes (column/line/map-kv/collection
// delimiters and quote char) for this Hive table.
protected TFileAttributes getFileAttributes() throws UserException {
TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
// NOTE(review): the following section (down to the blank line before
// "1. set column separator") is the PRE-change body captured by the diff —
// it read every delimiter straight from the storage descriptor's serde
// parameters and ignored TBLPROPERTIES. Its closing brace is cut off by the
// diff view; the post-change body follows it.
java.util.Map<String, String> delimiter = hmsTable.getRemoteTable().getSd().getSerdeInfo().getParameters();
if (delimiter.containsKey(PROP_FIELD_DELIMITER)) {
if (delimiter.get(PROP_FIELD_DELIMITER).length() == 0) {
textParams.setColumnSeparator(DEFAULT_FIELD_DELIMITER);
} else {
textParams.setColumnSeparator(delimiter.get(PROP_FIELD_DELIMITER));
}
} else if (delimiter.containsKey(PROP_SEPARATOR_CHAR)) {
textParams.setColumnSeparator(delimiter.get(PROP_SEPARATOR_CHAR));
} else {
textParams.setColumnSeparator(DEFAULT_FIELD_DELIMITER);
}
if (delimiter.containsKey(PROP_QUOTA_CHAR)) {
textParams.setEnclose(delimiter.get(PROP_QUOTA_CHAR).getBytes()[0]);
}
textParams.setLineDelimiter(delimiter.getOrDefault(PROP_LINE_DELIMITER, DEFAULT_LINE_DELIMITER));
textParams.setMapkvDelimiter(delimiter.getOrDefault(PROP_MAP_KV_DELIMITER, DEFAULT_MAP_KV_DELIMITER));

// textParams.collection_delimiter field is map, array and struct delimiter;
if (delimiter.get(PROP_COLLECTION_DELIMITER_HIVE2) != null) {
textParams.setCollectionDelimiter(delimiter.get(PROP_COLLECTION_DELIMITER_HIVE2));
} else if (delimiter.get(PROP_COLLECTION_DELIMITER_HIVE3) != null) {
textParams.setCollectionDelimiter(delimiter.get(PROP_COLLECTION_DELIMITER_HIVE3));
} else {
textParams.setCollectionDelimiter(DEFAULT_COLLECTION_DELIMITER);

// NOTE(review): POST-change body — each property is now resolved via
// HiveMetaStoreClientHelper.getSerdeProperty, which consults table
// properties first and falls back to serde-info parameters.
// 1. set column separator
Optional<String> fieldDelim =
HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_FIELD_DELIMITER);
Optional<String> columnSeparator =
HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_SEPARATOR_CHAR);
textParams.setColumnSeparator(HiveMetaStoreClientHelper.firstPresentOrDefault(
DEFAULT_FIELD_DELIMITER, fieldDelim, columnSeparator));
// 2. set line delimiter
Optional<String> lineDelim =
HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_LINE_DELIMITER);
textParams.setLineDelimiter(HiveMetaStoreClientHelper.firstPresentOrDefault(
DEFAULT_LINE_DELIMITER, lineDelim));
// 3. set mapkv delimiter
Optional<String> mapkvDelim =
HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_MAP_KV_DELIMITER);
textParams.setMapkvDelimiter(HiveMetaStoreClientHelper.firstPresentOrDefault(
DEFAULT_MAP_KV_DELIMITER, mapkvDelim));
// 4. set collection delimiter
// NOTE(review): Hive 2 really spells the property "colelction.delim" — the
// typo is in Hive itself, so both spellings are probed; Hive 2 wins ties.
Optional<String> collectionDelimHive2 =
HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_COLLECTION_DELIMITER_HIVE2);
Optional<String> collectionDelimHive3 =
HiveMetaStoreClientHelper.getSerdeProperty(hmsTable.getRemoteTable(), PROP_COLLECTION_DELIMITER_HIVE3);
textParams.setCollectionDelimiter(HiveMetaStoreClientHelper.firstPresentOrDefault(
DEFAULT_COLLECTION_DELIMITER, collectionDelimHive2, collectionDelimHive3));
// 5. set quote char
// Quote char is intentionally read only from serde info (no TBLPROPERTIES
// fallback); only the first byte is used as the enclose character.
Map<String, String> serdeParams = hmsTable.getRemoteTable().getSd().getSerdeInfo().getParameters();
if (serdeParams.containsKey(PROP_QUOTE_CHAR)) {
textParams.setEnclose(serdeParams.get(PROP_QUOTE_CHAR).getBytes()[0]);
}

TFileAttributes fileAttributes = new TFileAttributes();
fileAttributes.setTextParams(textParams);
fileAttributes.setHeaderType("");
Expand Down Expand Up @@ -502,3 +507,4 @@ protected TFileCompressType getFileCompressType(FileSplit fileSplit) throws User
return compressType;
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,23 @@
a 1.1
b 2.2

-- !2 --
1 abc
2 def

-- !2 --
1 abc
2 def

-- !1 --
a 1.1
b 2.2

-- !2 --
1 abc
2 def

-- !2 --
1 abc
2 def

Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ suite("test_hive_serde_prop", "external_docker,hive,external_docker_hive,p0,exte
);"""

qt_1 """select * from ${catalog_name}.${ex_db_name}.employee_gz order by name;"""


// serde_test1 declares its field delimiter in SERDEPROPERTIES, serde_test2 in
// TBLPROPERTIES; both must scan correctly once serde lookup checks both places.
// NOTE(review): both queries reuse tag qt_2, so the .out file must contain two
// "-- !2 --" sections in this exact order; distinct tags would be clearer.
qt_2 """select * from ${catalog_name}.regression.serde_test1 order by id;"""
qt_2 """select * from ${catalog_name}.regression.serde_test2 order by id;"""
}
}

0 comments on commit 9a72f75

Please sign in to comment.