From 561cf6db772d95606b1a95cc617a41595b4a9595 Mon Sep 17 00:00:00 2001 From: lyogev Date: Sat, 30 Jan 2021 16:20:49 +0200 Subject: [PATCH 1/4] fix(elasticsearch): add test to spark 3 from snapshot JAR --- e2e/elasticsearch/docker-compose.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/e2e/elasticsearch/docker-compose.yml b/e2e/elasticsearch/docker-compose.yml index a449c8ebb..af4a84a1b 100644 --- a/e2e/elasticsearch/docker-compose.yml +++ b/e2e/elasticsearch/docker-compose.yml @@ -2,22 +2,22 @@ version: '3' services: spark-submit: - image: metorikku/metorikku:spark2_standalone + image: metorikku/metorikku:standalone environment: - - SUBMIT_COMMAND=spark-submit --packages commons-httpclient:commons-httpclient:3.0.1 --jars https://repo1.maven.org/maven2/org/elasticsearch/elasticsearch-hadoop/6.8.10/elasticsearch-hadoop-6.8.10.jar --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/elasticsearch/movies.yaml + - SUBMIT_COMMAND=spark-submit --packages commons-httpclient:commons-httpclient:3.0.1 --jars https://yotpo-public.s3.amazonaws.com/elasticsearch-hadoop/elasticsearch-spark-30_2.12-8.0.0-SNAPSHOT.jar --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/elasticsearch/movies.yaml entrypoint: - /scripts/entrypoint-submit.sh depends_on: - spark-master - spark-worker spark-master: - image: metorikku/metorikku:spark2_standalone + image: metorikku/metorikku:standalone entrypoint: - /scripts/entrypoint-master.sh logging: driver: none spark-worker: - image: metorikku/metorikku:spark2_standalone + image: metorikku/metorikku:standalone entrypoint: - /scripts/entrypoint-worker.sh logging: From eabaa96c767e94580c49c92d367888f5db8d20c0 Mon Sep 17 00:00:00 2001 From: lyogev Date: Sat, 30 Jan 2021 20:08:50 +0200 Subject: [PATCH 2/4] fix(hudi): upgrade to 0.7.0 (spark 3) --- .travis.yml | 2 +- README.md | 2 +- docker/hive/Dockerfile | 2 +- docker/spark/custom-hadoop/Dockerfile | 2 +- e2e/cdc/docker-compose.yml | 12 ++++++------ 
e2e/hudi/docker-compose.yml | 28 +++++++++++++-------------- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/.travis.yml b/.travis.yml index afd345abe..e7d036652 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,7 +28,7 @@ env: - SPARK2_VERSION=2.4.6 - SPARK_VERSION=3.0.1 - HIVE_VERSION=2.3.7 - - HUDI_VERSION=0.5.3 + - HUDI_VERSION=0.7.0 - TARGET_CACHE=$HOME/target-cache/${TRAVIS_COMMIT} - LC_ALL=en_US.UTF-8 - LANG=en_US.UTF-8 diff --git a/README.md b/README.md index 0ad56941b..91b82b384 100644 --- a/README.md +++ b/README.md @@ -530,7 +530,7 @@ Metorikku supports reading/writing with [Apache Hudi](https://github.com/apache/ Hudi is a very exciting project that basically allows upserts and deletes directly on top of partitioned parquet data. In order to use Hudi with Metorikku you need to add to your classpath (via ```--jars``` or if running locally with ```-cp```) -an external JAR from here: https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.12/0.5.3/hudi-spark-bundle_2.12-0.5.3.jar +an external JAR from here: https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.12/0.7.0/hudi-spark-bundle_2.12-0.7.0.jar To run Hudi jobs you also have to make sure you have the following spark configuration (pass with ```--conf``` or ```-D```): ```properties diff --git a/docker/hive/Dockerfile b/docker/hive/Dockerfile index 944ee1933..4299deae7 100644 --- a/docker/hive/Dockerfile +++ b/docker/hive/Dockerfile @@ -23,7 +23,7 @@ ENV MYSQL_CONNECTOR_VERSION=5.1.47 RUN wget -q https://repo1.maven.org/maven2/mysql/mysql-connector-java/$MYSQL_CONNECTOR_VERSION/mysql-connector-java-$MYSQL_CONNECTOR_VERSION.jar \ && mv mysql-connector-java-$MYSQL_CONNECTOR_VERSION.jar $HIVE_HOME/lib -ARG HUDI_VERSION=0.5.3 +ARG HUDI_VERSION=0.7.0 RUN wget -q https://repo1.maven.org/maven2/org/apache/hudi/hudi-hive-bundle/$HUDI_VERSION/hudi-hive-bundle-$HUDI_VERSION.jar \ && mv hudi-hive-bundle-$HUDI_VERSION.jar $HIVE_HOME/lib RUN wget -q 
https://repo1.maven.org/maven2/org/apache/hudi/hudi-hadoop-mr-bundle/$HUDI_VERSION/hudi-hadoop-mr-bundle-$HUDI_VERSION.jar \ diff --git a/docker/spark/custom-hadoop/Dockerfile b/docker/spark/custom-hadoop/Dockerfile index 8a3629b70..b6819bcdd 100644 --- a/docker/spark/custom-hadoop/Dockerfile +++ b/docker/spark/custom-hadoop/Dockerfile @@ -27,7 +27,7 @@ RUN wget -q https://archive.apache.org/dist/hive/hive-$HIVE_VERSION/apache-hive- && rm apache-hive-$HIVE_VERSION-bin.tar.gz #Hudi for hive -ENV HUDI_VERSION=0.5.3 +ENV HUDI_VERSION=0.7.0 RUN wget -q https://repo1.maven.org/maven2/org/apache/hudi/hudi-hive-bundle/$HUDI_VERSION/hudi-hive-bundle-$HUDI_VERSION.jar \ && mv hudi-hive-bundle-$HUDI_VERSION.jar $HIVE_HOME/lib RUN wget -q https://repo1.maven.org/maven2/org/apache/hudi/hudi-hadoop-mr-bundle/$HUDI_VERSION/hudi-hadoop-mr-bundle-$HUDI_VERSION.jar \ diff --git a/e2e/cdc/docker-compose.yml b/e2e/cdc/docker-compose.yml index 09c48ab68..9d7b7f299 100644 --- a/e2e/cdc/docker-compose.yml +++ b/e2e/cdc/docker-compose.yml @@ -85,13 +85,13 @@ services: - mysql # Spark Resources spark-master: - image: metorikku/metorikku:spark2_standalone + image: metorikku/metorikku:standalone entrypoint: - /scripts/entrypoint-master.sh logging: driver: none spark-worker: - image: metorikku/metorikku:spark2_standalone + image: metorikku/metorikku:standalone entrypoint: - /scripts/entrypoint-worker.sh logging: @@ -108,9 +108,9 @@ services: - SCHEMA_REGISTRY_LISTENERS=http://schema-registry:8081 # Spark job: Read from CDC Kafka topic, Deserialize according to schema registry, Write to Hudi output spark-submit: - image: metorikku/metorikku:spark2_standalone + image: metorikku/metorikku:standalone environment: - - SUBMIT_COMMAND=spark-submit --repositories http://packages.confluent.io/maven/ --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.3/hudi-spark-bundle_2.11-0.5.3.jar,https://repo1.maven.org/maven2/za/co/absa/abris_2.11/3.2.2/abris_2.11-3.2.2.jar --conf 
spark.hadoop.mapreduce.input.pathFilter.class=org.apache.hudi.hadoop.HoodieROTablePathFilter --packages io.confluent:kafka-schema-registry-client:5.3.0,io.confluent:kafka-avro-serializer:5.3.0 --conf spark.sql.warehouse.dir=/warehouse --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/kafka/kafka_example_cdc.yaml + - SUBMIT_COMMAND=spark-submit --repositories http://packages.confluent.io/maven/ --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.12/0.7.0/hudi-spark-bundle_2.12-0.7.0.jar,https://repo1.maven.org/maven2/za/co/absa/abris_2.12/3.2.2/abris_2.12-3.2.2.jar --conf spark.hadoop.mapreduce.input.pathFilter.class=org.apache.hudi.hadoop.HoodieROTablePathFilter --packages io.confluent:kafka-schema-registry-client:5.3.0,io.confluent:kafka-avro-serializer:5.3.0 --conf spark.sql.warehouse.dir=/warehouse --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/kafka/kafka_example_cdc.yaml - HIVE_METASTORE_URI=hive:9083 entrypoint: - /scripts/entrypoint-submit.sh @@ -142,9 +142,9 @@ services: # - 9083:9083 # Hive test: Select from hive table and assert over the result hive-tester: - image: metorikku/metorikku:spark2_standalone + image: metorikku/metorikku:standalone environment: - - SUBMIT_COMMAND=spark-submit --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.3/hudi-spark-bundle_2.11-0.5.3.jar --conf spark.hadoop.mapreduce.input.pathFilter.class=org.apache.hudi.hadoop.HoodieROTablePathFilter --conf spark.sql.warehouse.dir=/warehouse --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings /test_metrics/hive_test.yaml + - SUBMIT_COMMAND=spark-submit --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.12/0.7.0/hudi-spark-bundle_2.12-0.7.0.jar --conf spark.hadoop.mapreduce.input.pathFilter.class=org.apache.hudi.hadoop.HoodieROTablePathFilter --conf spark.sql.warehouse.dir=/warehouse --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings 
/test_metrics/hive_test.yaml - HIVE_METASTORE_URI=hive:9083 volumes: - ./output/:/examples/output/ diff --git a/e2e/hudi/docker-compose.yml b/e2e/hudi/docker-compose.yml index b0616056f..95571057d 100644 --- a/e2e/hudi/docker-compose.yml +++ b/e2e/hudi/docker-compose.yml @@ -1,9 +1,9 @@ version: '3' services: spark-submit: - image: metorikku/metorikku:spark2_standalone + image: metorikku/metorikku:standalone environment: - - SUBMIT_COMMAND=spark-submit --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.3/hudi-spark-bundle_2.11-0.5.3.jar --conf spark.hadoop.mapreduce.input.pathFilter.class=org.apache.hudi.hadoop.HoodieROTablePathFilter --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/hudi/movies.yaml + - SUBMIT_COMMAND=spark-submit --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.12/0.7.0/hudi-spark-bundle_2.12-0.7.0.jar --conf spark.hadoop.mapreduce.input.pathFilter.class=org.apache.hudi.hadoop.HoodieROTablePathFilter --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/hudi/movies.yaml - HIVE_METASTORE_URI=hive:9083 volumes: - ./output/:/examples/output/ @@ -13,9 +13,9 @@ services: - spark-master - spark-worker hive-tester: - image: metorikku/metorikku:spark2_standalone + image: metorikku/metorikku:standalone environment: - - SUBMIT_COMMAND=spark-submit --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.3/hudi-spark-bundle_2.11-0.5.3.jar --conf spark.hadoop.mapreduce.input.pathFilter.class=org.apache.hudi.hadoop.HoodieROTablePathFilter --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings examples/hudi/movies_test.yaml + - SUBMIT_COMMAND=spark-submit --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.12/0.7.0/hudi-spark-bundle_2.12-0.7.0.jar --conf spark.hadoop.mapreduce.input.pathFilter.class=org.apache.hudi.hadoop.HoodieROTablePathFilter --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings 
examples/hudi/movies_test.yaml - HIVE_METASTORE_URI=hive:9083 volumes: - ./output/:/examples/output/ @@ -25,9 +25,9 @@ services: - spark-master - spark-worker spark-submit-manual-hive-sync: - image: metorikku/metorikku:spark2_standalone + image: metorikku/metorikku:standalone environment: - - SUBMIT_COMMAND=spark-submit --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.3/hudi-spark-bundle_2.11-0.5.3.jar --conf spark.hadoop.mapreduce.input.pathFilter.class=org.apache.hudi.hadoop.HoodieROTablePathFilter --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/hudi/manual_hive_sync_config.yaml + - SUBMIT_COMMAND=spark-submit --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.12/0.7.0/hudi-spark-bundle_2.12-0.7.0.jar --conf spark.hadoop.mapreduce.input.pathFilter.class=org.apache.hudi.hadoop.HoodieROTablePathFilter --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/hudi/manual_hive_sync_config.yaml - HIVE_METASTORE_URI=hive:9083 volumes: - ./output/:/examples/output/ @@ -37,9 +37,9 @@ services: - spark-master - spark-worker hive-tester-manual-hive-sync: - image: metorikku/metorikku:spark2_standalone + image: metorikku/metorikku:standalone environment: - - SUBMIT_COMMAND=spark-submit --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.3/hudi-spark-bundle_2.11-0.5.3.jar --conf spark.hadoop.mapreduce.input.pathFilter.class=org.apache.hudi.hadoop.HoodieROTablePathFilter --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings examples/hudi/manual_hive_sync_test.yaml + - SUBMIT_COMMAND=spark-submit --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.12/0.7.0/hudi-spark-bundle_2.12-0.7.0.jar --conf spark.hadoop.mapreduce.input.pathFilter.class=org.apache.hudi.hadoop.HoodieROTablePathFilter --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings examples/hudi/manual_hive_sync_test.yaml - HIVE_METASTORE_URI=hive:9083 
volumes: - ./output/:/examples/output/ @@ -49,9 +49,9 @@ services: - spark-master - spark-worker spark-submit-manual-hive-sync-non-partition: - image: metorikku/metorikku:spark2_standalone + image: metorikku/metorikku:standalone environment: - - SUBMIT_COMMAND=spark-submit --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.3/hudi-spark-bundle_2.11-0.5.3.jar --conf spark.hadoop.mapreduce.input.pathFilter.class=org.apache.hudi.hadoop.HoodieROTablePathFilter --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/hudi/manual_hive_sync_no_partitions_config.yaml + - SUBMIT_COMMAND=spark-submit --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.12/0.7.0/hudi-spark-bundle_2.12-0.7.0.jar --conf spark.hadoop.mapreduce.input.pathFilter.class=org.apache.hudi.hadoop.HoodieROTablePathFilter --class com.yotpo.metorikku.Metorikku metorikku.jar -c examples/hudi/manual_hive_sync_no_partitions_config.yaml - HIVE_METASTORE_URI=hive:9083 volumes: - ./output/:/examples/output/ @@ -61,9 +61,9 @@ services: - spark-master - spark-worker hive-tester-manual-hive-sync-no-partition: - image: metorikku/metorikku:spark2_standalone + image: metorikku/metorikku:standalone environment: - - SUBMIT_COMMAND=spark-submit --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.11/0.5.3/hudi-spark-bundle_2.11-0.5.3.jar --conf spark.hadoop.mapreduce.input.pathFilter.class=org.apache.hudi.hadoop.HoodieROTablePathFilter --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings examples/hudi/manual_hive_sync_no_partitions_test.yaml + - SUBMIT_COMMAND=spark-submit --jars https://repo1.maven.org/maven2/org/apache/hudi/hudi-spark-bundle_2.12/0.7.0/hudi-spark-bundle_2.12-0.7.0.jar --conf spark.hadoop.mapreduce.input.pathFilter.class=org.apache.hudi.hadoop.HoodieROTablePathFilter --class com.yotpo.metorikku.MetorikkuTester metorikku.jar --test-settings examples/hudi/manual_hive_sync_no_partitions_test.yaml - 
HIVE_METASTORE_URI=hive:9083 volumes: - ./output/:/examples/output/ @@ -73,13 +73,13 @@ services: - spark-master - spark-worker spark-master: - image: metorikku/metorikku:spark2_standalone + image: metorikku/metorikku:standalone entrypoint: - /scripts/entrypoint-master.sh logging: driver: none spark-worker: - image: metorikku/metorikku:spark2_standalone + image: metorikku/metorikku:standalone entrypoint: - /scripts/entrypoint-worker.sh volumes: From 42fd4794b832f0b994321494c5a6e43dd22a58be Mon Sep 17 00:00:00 2001 From: lyogev Date: Mon, 1 Feb 2021 09:30:03 +0200 Subject: [PATCH 3/4] fix(docker): log in to docker hub --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index e7d036652..c85925ba5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -37,6 +37,7 @@ before_script: - export -f travis_time_start - export -f travis_time_finish - export -f travis_nanoseconds + - echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin || echo "Logging in to docker" jobs: include: - stage: "Build" From f5ea7c690bc8b6b14c76fc35fc1bc35604ea9a9a Mon Sep 17 00:00:00 2001 From: lyogev Date: Mon, 1 Feb 2021 10:40:40 +0200 Subject: [PATCH 4/4] fix(hudi): revert hudi to 0.5.3 in CI and Hive images --- .travis.yml | 2 +- docker/hive/Dockerfile | 2 +- docker/spark/custom-hadoop/Dockerfile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index c85925ba5..47749bbf1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,7 +28,7 @@ env: - SPARK2_VERSION=2.4.6 - SPARK_VERSION=3.0.1 - HIVE_VERSION=2.3.7 - - HUDI_VERSION=0.7.0 + - HUDI_VERSION=0.5.3 - TARGET_CACHE=$HOME/target-cache/${TRAVIS_COMMIT} - LC_ALL=en_US.UTF-8 - LANG=en_US.UTF-8 diff --git a/docker/hive/Dockerfile b/docker/hive/Dockerfile index 4299deae7..944ee1933 100644 --- a/docker/hive/Dockerfile +++ b/docker/hive/Dockerfile @@ -23,7 +23,7 @@ ENV MYSQL_CONNECTOR_VERSION=5.1.47 RUN wget -q 
https://repo1.maven.org/maven2/mysql/mysql-connector-java/$MYSQL_CONNECTOR_VERSION/mysql-connector-java-$MYSQL_CONNECTOR_VERSION.jar \ && mv mysql-connector-java-$MYSQL_CONNECTOR_VERSION.jar $HIVE_HOME/lib -ARG HUDI_VERSION=0.7.0 +ARG HUDI_VERSION=0.5.3 RUN wget -q https://repo1.maven.org/maven2/org/apache/hudi/hudi-hive-bundle/$HUDI_VERSION/hudi-hive-bundle-$HUDI_VERSION.jar \ && mv hudi-hive-bundle-$HUDI_VERSION.jar $HIVE_HOME/lib RUN wget -q https://repo1.maven.org/maven2/org/apache/hudi/hudi-hadoop-mr-bundle/$HUDI_VERSION/hudi-hadoop-mr-bundle-$HUDI_VERSION.jar \ diff --git a/docker/spark/custom-hadoop/Dockerfile b/docker/spark/custom-hadoop/Dockerfile index b6819bcdd..8a3629b70 100644 --- a/docker/spark/custom-hadoop/Dockerfile +++ b/docker/spark/custom-hadoop/Dockerfile @@ -27,7 +27,7 @@ RUN wget -q https://archive.apache.org/dist/hive/hive-$HIVE_VERSION/apache-hive- && rm apache-hive-$HIVE_VERSION-bin.tar.gz #Hudi for hive -ENV HUDI_VERSION=0.7.0 +ENV HUDI_VERSION=0.5.3 RUN wget -q https://repo1.maven.org/maven2/org/apache/hudi/hudi-hive-bundle/$HUDI_VERSION/hudi-hive-bundle-$HUDI_VERSION.jar \ && mv hudi-hive-bundle-$HUDI_VERSION.jar $HIVE_HOME/lib RUN wget -q https://repo1.maven.org/maven2/org/apache/hudi/hudi-hadoop-mr-bundle/$HUDI_VERSION/hudi-hadoop-mr-bundle-$HUDI_VERSION.jar \