From 9f73cd4df0f06b88147e83e9780614626b7ace37 Mon Sep 17 00:00:00 2001 From: Shardul Mahadik Date: Wed, 25 Nov 2020 09:42:48 -0800 Subject: [PATCH] ORC: Allow reads of tinyint, smallint, char, varchar types (#1821) --- .../iceberg/data/orc/TestGenericData.java | 49 +++++++++++++++++++ .../org/apache/iceberg/orc/ORCSchemaUtil.java | 12 +++-- 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java b/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java index a18ef5f9d..f86db77dd 100644 --- a/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java +++ b/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java @@ -21,11 +21,14 @@ import java.io.File; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.time.LocalDateTime; import java.time.OffsetDateTime; import java.util.Collections; import java.util.List; import java.util.TimeZone; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; import org.apache.iceberg.Files; import org.apache.iceberg.Schema; import org.apache.iceberg.data.DataTest; @@ -38,9 +41,16 @@ import org.apache.iceberg.orc.ORC; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Types; +import org.apache.orc.OrcFile; +import org.apache.orc.TypeDescription; +import org.apache.orc.Writer; +import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; +import org.apache.orc.storage.ql.exec.vector.LongColumnVector; +import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; import org.junit.Assert; import org.junit.Test; +import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; public class TestGenericData extends DataTest { @@ -123,6 +133,45 @@ public void writeAndValidateTimestamps() throws IOException { } } + @Test + public void writeAndValidateExternalData() throws IOException { + File testFile = temp.newFile(); + Assert.assertTrue("Delete should succeed", testFile.delete()); + + Configuration conf = new Configuration(); + TypeDescription writerSchema = TypeDescription.fromString("struct"); + Writer writer = OrcFile.createWriter(new Path(testFile.toString()), + OrcFile.writerOptions(conf) + .setSchema(writerSchema)); + VectorizedRowBatch batch = writerSchema.createRowBatch(); + batch.ensureSize(1); + batch.size = 1; + ((LongColumnVector) batch.cols[0]).vector[0] = 1; + ((LongColumnVector) batch.cols[1]).vector[0] = 123; + ((BytesColumnVector) batch.cols[2]).setVal(0, "1".getBytes(StandardCharsets.UTF_8)); + ((BytesColumnVector) batch.cols[3]).setVal(0, "123".getBytes(StandardCharsets.UTF_8)); + writer.addRowBatch(batch); + writer.close(); + + List rows; + Schema readSchema = new Schema( + optional(1, "a", Types.IntegerType.get()), + optional(2, "b", Types.IntegerType.get()), + optional(3, "c", Types.StringType.get()), + optional(4, "d", Types.StringType.get()) + ); + try (CloseableIterable reader = ORC.read(Files.localInput(testFile)) + .project(readSchema) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(readSchema, fileSchema)) + .build()) { + rows = Lists.newArrayList(reader); + } + Assert.assertEquals(1, rows.get(0).getField("a")); + Assert.assertEquals(123, rows.get(0).getField("b")); + Assert.assertEquals("1", rows.get(0).getField("c")); + Assert.assertEquals("123", rows.get(0).getField("d")); + } + private void writeAndValidateRecords(Schema schema, List expected) throws IOException { File testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORCSchemaUtil.java b/orc/src/main/java/org/apache/iceberg/orc/ORCSchemaUtil.java index e17c7592f..9c2de518d 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ORCSchemaUtil.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ORCSchemaUtil.java @@ -27,7 +27,7 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.mapping.NameMapping; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMultimap; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.TypeUtil; @@ -80,15 +80,19 @@ public TypeDescription type() { public static final String ICEBERG_LONG_TYPE_ATTRIBUTE = "iceberg.long-type"; static final String ICEBERG_FIELD_LENGTH = "iceberg.length"; - private static final ImmutableMap TYPE_MAPPING = - ImmutableMap.builder() + private static final ImmutableMultimap TYPE_MAPPING = + ImmutableMultimap.builder() .put(Type.TypeID.BOOLEAN, TypeDescription.Category.BOOLEAN) + .put(Type.TypeID.INTEGER, TypeDescription.Category.BYTE) + .put(Type.TypeID.INTEGER, TypeDescription.Category.SHORT) .put(Type.TypeID.INTEGER, TypeDescription.Category.INT) .put(Type.TypeID.LONG, TypeDescription.Category.LONG) .put(Type.TypeID.TIME, TypeDescription.Category.LONG) .put(Type.TypeID.FLOAT, TypeDescription.Category.FLOAT) .put(Type.TypeID.DOUBLE, TypeDescription.Category.DOUBLE) .put(Type.TypeID.DATE, TypeDescription.Category.DATE) + .put(Type.TypeID.STRING, TypeDescription.Category.CHAR) + .put(Type.TypeID.STRING, TypeDescription.Category.VARCHAR) .put(Type.TypeID.STRING, TypeDescription.Category.STRING) .put(Type.TypeID.UUID, TypeDescription.Category.BINARY) .put(Type.TypeID.FIXED, TypeDescription.Category.BINARY) @@ -378,7 +382,7 @@ private static boolean isSameType(TypeDescription orcType, Type icebergType) { tsType.shouldAdjustToUTC() ? TypeDescription.Category.TIMESTAMP_INSTANT : TypeDescription.Category.TIMESTAMP, orcType.getCategory()); } else { - return Objects.equals(TYPE_MAPPING.get(icebergType.typeId()), orcType.getCategory()); + return TYPE_MAPPING.containsEntry(icebergType.typeId(), orcType.getCategory()); } }