Skip to content

Commit

Permalink
ORC: Allow reads of tinyint, smallint, char, varchar types (#1821)
Browse files Browse the repository at this point in the history
  • Loading branch information
shardulm94 committed Dec 2, 2020
1 parent 2e111a1 commit 9f73cd4
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,14 @@

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.OffsetDateTime;
import java.util.Collections;
import java.util.List;
import java.util.TimeZone;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.DataTest;
Expand All @@ -38,9 +41,16 @@
import org.apache.iceberg.orc.ORC;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.types.Types;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import org.apache.orc.storage.ql.exec.vector.BytesColumnVector;
import org.apache.orc.storage.ql.exec.vector.LongColumnVector;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;
import org.junit.Assert;
import org.junit.Test;

import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;

public class TestGenericData extends DataTest {
Expand Down Expand Up @@ -123,6 +133,45 @@ public void writeAndValidateTimestamps() throws IOException {
}
}

@Test
public void writeAndValidateExternalData() throws IOException {
File testFile = temp.newFile();
Assert.assertTrue("Delete should succeed", testFile.delete());

Configuration conf = new Configuration();
TypeDescription writerSchema = TypeDescription.fromString("struct<a:tinyint,b:smallint,c:char(10),d:varchar(10)>");
Writer writer = OrcFile.createWriter(new Path(testFile.toString()),
OrcFile.writerOptions(conf)
.setSchema(writerSchema));
VectorizedRowBatch batch = writerSchema.createRowBatch();
batch.ensureSize(1);
batch.size = 1;
((LongColumnVector) batch.cols[0]).vector[0] = 1;
((LongColumnVector) batch.cols[1]).vector[0] = 123;
((BytesColumnVector) batch.cols[2]).setVal(0, "1".getBytes(StandardCharsets.UTF_8));
((BytesColumnVector) batch.cols[3]).setVal(0, "123".getBytes(StandardCharsets.UTF_8));
writer.addRowBatch(batch);
writer.close();

List<Record> rows;
Schema readSchema = new Schema(
optional(1, "a", Types.IntegerType.get()),
optional(2, "b", Types.IntegerType.get()),
optional(3, "c", Types.StringType.get()),
optional(4, "d", Types.StringType.get())
);
try (CloseableIterable<Record> reader = ORC.read(Files.localInput(testFile))
.project(readSchema)
.createReaderFunc(fileSchema -> GenericOrcReader.buildReader(readSchema, fileSchema))
.build()) {
rows = Lists.newArrayList(reader);
}
Assert.assertEquals(1, rows.get(0).getField("a"));
Assert.assertEquals(123, rows.get(0).getField("b"));
Assert.assertEquals("1", rows.get(0).getField("c"));
Assert.assertEquals("123", rows.get(0).getField("d"));
}

private void writeAndValidateRecords(Schema schema, List<Record> expected) throws IOException {
File testFile = temp.newFile();
Assert.assertTrue("Delete should succeed", testFile.delete());
Expand Down
12 changes: 8 additions & 4 deletions orc/src/main/java/org/apache/iceberg/orc/ORCSchemaUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import org.apache.iceberg.Schema;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMultimap;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.TypeUtil;
Expand Down Expand Up @@ -80,15 +80,19 @@ public TypeDescription type() {
public static final String ICEBERG_LONG_TYPE_ATTRIBUTE = "iceberg.long-type";
static final String ICEBERG_FIELD_LENGTH = "iceberg.length";

private static final ImmutableMap<Type.TypeID, TypeDescription.Category> TYPE_MAPPING =
ImmutableMap.<Type.TypeID, TypeDescription.Category>builder()
private static final ImmutableMultimap<Type.TypeID, TypeDescription.Category> TYPE_MAPPING =
ImmutableMultimap.<Type.TypeID, TypeDescription.Category>builder()
.put(Type.TypeID.BOOLEAN, TypeDescription.Category.BOOLEAN)
.put(Type.TypeID.INTEGER, TypeDescription.Category.BYTE)
.put(Type.TypeID.INTEGER, TypeDescription.Category.SHORT)
.put(Type.TypeID.INTEGER, TypeDescription.Category.INT)
.put(Type.TypeID.LONG, TypeDescription.Category.LONG)
.put(Type.TypeID.TIME, TypeDescription.Category.LONG)
.put(Type.TypeID.FLOAT, TypeDescription.Category.FLOAT)
.put(Type.TypeID.DOUBLE, TypeDescription.Category.DOUBLE)
.put(Type.TypeID.DATE, TypeDescription.Category.DATE)
.put(Type.TypeID.STRING, TypeDescription.Category.CHAR)
.put(Type.TypeID.STRING, TypeDescription.Category.VARCHAR)
.put(Type.TypeID.STRING, TypeDescription.Category.STRING)
.put(Type.TypeID.UUID, TypeDescription.Category.BINARY)
.put(Type.TypeID.FIXED, TypeDescription.Category.BINARY)
Expand Down Expand Up @@ -378,7 +382,7 @@ private static boolean isSameType(TypeDescription orcType, Type icebergType) {
tsType.shouldAdjustToUTC() ? TypeDescription.Category.TIMESTAMP_INSTANT : TypeDescription.Category.TIMESTAMP,
orcType.getCategory());
} else {
return Objects.equals(TYPE_MAPPING.get(icebergType.typeId()), orcType.getCategory());
return TYPE_MAPPING.containsEntry(icebergType.typeId(), orcType.getCategory());
}
}

Expand Down

0 comments on commit 9f73cd4

Please sign in to comment.