-
Notifications
You must be signed in to change notification settings - Fork 4.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
HIVE-28586 Support write order for Iceberg tables at CREATE TABLE #5541
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -88,6 +88,7 @@ private InputFormatConfig() { | |
public static final String CATALOG_CLASS_TEMPLATE = "iceberg.catalog.%s.catalog-impl"; | ||
public static final String CATALOG_DEFAULT_CONFIG_PREFIX = "iceberg.catalog-default."; | ||
public static final String QUERY_FILTERS = "iceberg.query.filters"; | ||
public static final String INSERT_WRITE_ORDER = "iceberg.write-order"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can't we use iceberg config: TableProperties.DEFAULT_SORT_ORDER?
|
||
|
||
public enum InMemoryDataModel { | ||
PIG, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,6 +47,7 @@ | |
import org.apache.hadoop.hive.metastore.api.EnvironmentContext; | ||
import org.apache.hadoop.hive.metastore.api.FieldSchema; | ||
import org.apache.hadoop.hive.metastore.api.MetaException; | ||
import org.apache.hadoop.hive.metastore.api.NullOrderingType; | ||
import org.apache.hadoop.hive.metastore.api.SQLPrimaryKey; | ||
import org.apache.hadoop.hive.metastore.api.SerDeInfo; | ||
import org.apache.hadoop.hive.metastore.api.StorageDescriptor; | ||
|
@@ -82,13 +83,16 @@ | |
import org.apache.iceberg.FileScanTask; | ||
import org.apache.iceberg.MetadataTableType; | ||
import org.apache.iceberg.MetadataTableUtils; | ||
import org.apache.iceberg.NullOrder; | ||
import org.apache.iceberg.PartitionData; | ||
import org.apache.iceberg.PartitionField; | ||
import org.apache.iceberg.PartitionSpec; | ||
import org.apache.iceberg.PartitionSpecParser; | ||
import org.apache.iceberg.PartitionsTable; | ||
import org.apache.iceberg.Schema; | ||
import org.apache.iceberg.SchemaParser; | ||
import org.apache.iceberg.SortOrder; | ||
import org.apache.iceberg.SortOrderParser; | ||
import org.apache.iceberg.Table; | ||
import org.apache.iceberg.TableMetadata; | ||
import org.apache.iceberg.TableMetadataParser; | ||
|
@@ -271,6 +275,23 @@ public void preCreateTable(CreateTableRequest request) { | |
setOrcOnlyFilesParam(hmsTable); | ||
// Remove hive primary key columns from table request, as iceberg doesn't support hive primary key. | ||
request.setPrimaryKeys(null); | ||
addSortOrder(hmsTable, schema, catalogProperties); | ||
} | ||
|
||
private void addSortOrder(org.apache.hadoop.hive.metastore.api.Table hmsTable, Schema schema, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
Properties properties) { | ||
SortOrder.Builder sortOderBuilder = SortOrder.builderFor(schema); | ||
hmsTable.getSd().getSortCols().forEach(item -> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. iceberg supports expressions in sort order like |
||
NullOrder nullOrder = item.getNullOrdering() == NullOrderingType.NULLS_FIRST ? | ||
NullOrder.NULLS_FIRST : NullOrder.NULLS_LAST; | ||
if (item.getOrder() == 0) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
sortOderBuilder.desc(item.getCol(), nullOrder); | ||
} else { | ||
sortOderBuilder.asc(item.getCol(), nullOrder); | ||
} | ||
|
||
}); | ||
properties.put(InputFormatConfig.INSERT_WRITE_ORDER, SortOrderParser.toJson(sortOderBuilder.build())); | ||
} | ||
|
||
@Override | ||
|
@@ -781,7 +802,7 @@ private void setCommonHmsTablePropertiesForIceberg(org.apache.hadoop.hive.metast | |
* @param hmsTable Table for which we are calculating the properties | ||
* @return The properties we can provide for Iceberg functions, like {@link Catalogs} | ||
*/ | ||
private static Properties getCatalogProperties(org.apache.hadoop.hive.metastore.api.Table hmsTable) { | ||
private Properties getCatalogProperties(org.apache.hadoop.hive.metastore.api.Table hmsTable) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why static was removed? |
||
Properties properties = new Properties(); | ||
|
||
hmsTable.getParameters().entrySet().stream().filter(e -> e.getKey() != null && e.getValue() != null).forEach(e -> { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
-- Mask neededVirtualColumns due to non-strict order | ||
--! qt:replace:/(\s+neededVirtualColumns:\s)(.*)/$1#Masked#/ | ||
-- Mask the totalSize value as it can have slight variability, causing test flakiness | ||
--! qt:replace:/(\s+totalSize\s+)\S+(\s+)/$1#Masked#$2/ | ||
-- Mask random uuid | ||
--! qt:replace:/(\s+uuid\s+)\S+(\s*)/$1#Masked#$2/ | ||
-- Mask a random snapshot id | ||
--! qt:replace:/(\s+current-snapshot-id\s+)\S+(\s*)/$1#Masked#/ | ||
-- Mask added file size | ||
--! qt:replace:/(\S\"added-files-size\\\":\\\")(\d+)(\\\")/$1#Masked#$3/ | ||
-- Mask total file size | ||
--! qt:replace:/(\S\"total-files-size\\\":\\\")(\d+)(\\\")/$1#Masked#$3/ | ||
-- Mask removed file size | ||
--! qt:replace:/(\S\"removed-files-size\\\":\\\")(\d+)(\\\")/$1#Masked#$3/ | ||
-- Mask current-snapshot-timestamp-ms | ||
--! qt:replace:/(\s+current-snapshot-timestamp-ms\s+)\S+(\s*)/$1#Masked#$2/ | ||
--! qt:replace:/(MAJOR\s+succeeded\s+)[a-zA-Z0-9\-\.\s+]+(\s+manual)/$1#Masked#$2/ | ||
-- Mask iceberg version | ||
--! qt:replace:/(\S\"iceberg-version\\\":\\\")(\w+\s\w+\s\d+\.\d+\.\d+\s\(\w+\s\w+\))(\\\")/$1#Masked#$3/ | ||
set hive.llap.io.enabled=true; | ||
set hive.vectorized.execution.enabled=true; | ||
set hive.optimize.shared.work.merge.ts.schema=true; | ||
|
||
|
||
create table ice_orc_sorted (id int, text string) write ordered by id desc nulls first, text asc nulls last stored by iceberg stored as orc; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we specify the sort column/expressions in parentheses? |
||
|
||
insert into ice_orc_sorted values (3, "3"),(2, "2"),(4, "4"),(5, "5"),(1, "1"),(2, "3"),(3,null),(2,null),(null,"a"); | ||
|
||
describe formatted ice_orc_sorted; | ||
describe extended ice_orc_sorted; | ||
|
||
select * from ice_orc_sorted; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,7 @@ Table Parameters: | |
current-schema {\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"i\",\"required\":false,\"type\":\"int\"},{\"id\":2,\"name\":\"s\",\"required\":false,\"type\":\"string\"},{\"id\":3,\"name\":\"ts\",\"required\":false,\"type\":\"timestamp\"},{\"id\":4,\"name\":\"d\",\"required\":false,\"type\":\"date\"}]} | ||
format-version 2 | ||
iceberg.orc.files.only true | ||
iceberg.write-order {\"order-id\":0,\"fields\":[]} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we add this if no order provided? i think that config shouldn't even be set |
||
metadata_location hdfs://### HDFS PATH ### | ||
numFiles 0 | ||
numRows 0 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sortOrderAsJson
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think I would use "asJson" in a method name to express a transformation is happening like toString(), asJson(), asXML ect not in a variable name.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i don't insist, just saw this naming is used in iceberg