Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PR1] refactoring :apps build and adding spark3.5/iceberg1.5 artifact #255

Merged
merged 5 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions apps/spark-3.5/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Build for the Spark 3.5 / Iceberg 1.5 flavor of the OpenHouse apps jar.
plugins {
// dependency in apps-spark-conventions
id 'com.github.johnrengelman.shadow' version '7.1.2'
// shared conventions for all :apps spark modules (shadowJar config, common deps)
id 'openhouse.apps-spark-common'
// pins Iceberg dependencies to the 1.5.2 line for this module
id 'openhouse.iceberg-conventions-1.5.2'
}

// Version coordinates consumed by the openhouse.apps-spark-common convention
// plugin and the dependency declarations below.
ext {
  log4jVersion = "2.20.0"

  sparkVersion = '3.5.2'
  icebergVersion = '1.5.2'
  // Derive the "major.minor" suffix (e.g. "3.5") from sparkVersion so the
  // suffix cannot silently drift from the full version on a Spark bump.
  sparkVersionSuffix = sparkVersion.substring(0, sparkVersion.lastIndexOf('.'))
  // Interpolate the suffix into the runtime artifact name as well (was a
  // second hard-coded "3.5" that had to be kept in sync by hand).
  openhouseSparkRuntimeModule = ":integrations:spark:spark-${sparkVersionSuffix}:openhouse-spark-${sparkVersionSuffix}-runtime_2.12"
  icebergSparkRuntimeModule = "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:${icebergVersion}"
  tablesTestFixturesModule = ":tables-test-fixtures:tables-test-fixtures-iceberg-1.5_2.12"
}

dependencies {
  // Spark and Hadoop classes are provided by the runtime environment; exclude
  // them (and netty) from every Spark-adjacent dependency to avoid duplicate
  // classes on the shaded classpath. Shared closure instead of four
  // copy-pasted exclude lists.
  def sparkProvidedExcludes = {
    exclude group: 'io.netty'
    exclude group: 'org.apache.hadoop', module: 'hadoop-common'
    exclude group: 'org.apache.hadoop', module: 'hadoop-client'
  }

  compileOnly(project(path: openhouseSparkRuntimeModule), sparkProvidedExcludes)

  implementation "org.apache.logging.log4j:log4j-slf4j-impl:${log4jVersion}"
  implementation(project(':libs:datalayout')) {
    // datalayout carries the Spark 3.1 iceberg runtime; this module supplies
    // its own 3.5 runtime below.
    exclude group: 'org.apache.iceberg', module: 'iceberg-spark-runtime-3.1_2.12'
  }
  implementation("org.apache.spark:spark-core_2.12:${sparkVersion}", sparkProvidedExcludes)
  implementation("org.apache.spark:spark-sql_2.12:${sparkVersion}", sparkProvidedExcludes)
  implementation(icebergSparkRuntimeModule) {
    exclude group: 'io.netty'
  }

  // Tests run against the shaded (relocated) runtime jar, same excludes apply.
  testImplementation(project(path: openhouseSparkRuntimeModule, configuration: 'shadow'), sparkProvidedExcludes)
  testImplementation(project(tablesTestFixturesModule)) {
    exclude group: 'io.netty'
  }
}

// because we depend on project(':apps:openhouse-spark-apps_2.12').sourceSets
evaluationDependsOn(':apps:openhouse-spark-apps_2.12')
// Compile the shared spark-apps sources directly into this module (in addition
// to any 3.5-specific sources under src/main/java and src/test/java), so the
// same app code is built against Spark 3.5 / Iceberg 1.5 without duplication.
// NOTE(review): assumes every shared source file compiles against both Spark
// versions — confirm when the 3.1 sources diverge.
sourceSets {
main {
java {
srcDirs = ['src/main/java', project(':apps:openhouse-spark-apps_2.12').sourceSets.main.java.srcDirs]
}
}
test {
java {
srcDirs = ['src/test/java', project(':apps:openhouse-spark-apps_2.12').sourceSets.test.java.srcDirs]
}
}
}
140 changes: 16 additions & 124 deletions apps/spark/build.gradle
Original file line number Diff line number Diff line change
@@ -1,37 +1,29 @@
plugins {
id 'openhouse.java-conventions'
id 'openhouse.hadoop-conventions'
id 'openhouse.iceberg-conventions-1.2'
cbb330 marked this conversation as resolved.
Show resolved Hide resolved
id 'openhouse.maven-publish'
// dependency in apps-spark-conventions
id 'com.github.johnrengelman.shadow' version '7.1.2'
}

configurations {
// Excluding these libraries avoids competing implementations for LoggerFactory
// Standardizing on slf4j + log4j2 as implementation.
all*.exclude module : 'spring-boot-starter-logging'
all*.exclude module : 'logback-classic'
shadow.extendsFrom implementation
id 'openhouse.apps-spark-common'
id 'openhouse.iceberg-conventions-1.2'
}

ext {
log4jVersion = "2.18.0"

sparkVersion = '3.1.1'
icebergVersion = '1.2.0'
sparkVersionSuffix = "3.1"
openhouseSparkRuntimeModule = ":integrations:spark:spark-${sparkVersionSuffix}:openhouse-spark-runtime_2.12"
icebergSparkRuntimeModule = "org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:${icebergVersion}"
tablesTestFixturesModule = ":tables-test-fixtures:tables-test-fixtures_2.12"
}

dependencies {
implementation project(':iceberg:openhouse:internalcatalog')
implementation project(':client:hts')
implementation project(':client:jobsclient')
implementation project(':client:tableclient')
implementation project(':client:secureclient')
implementation project(':services:common')
implementation project(':cluster:storage')
compileOnly (project(path: ':integrations:spark:spark-3.1:openhouse-spark-runtime_2.12')) {
compileOnly (project(path: openhouseSparkRuntimeModule)) {
exclude group: 'io.netty'
exclude group: 'org.apache.hadoop', module: 'hadoop-common'
exclude group: 'org.apache.hadoop', module: 'hadoop-client'
}

implementation "org.apache.logging.log4j:log4j-slf4j-impl:${log4jVersion}"
implementation project(':libs:datalayout')
implementation ('org.apache.spark:spark-core_2.12:' + sparkVersion) {
exclude group: 'io.netty'
Expand All @@ -43,116 +35,16 @@ dependencies {
exclude group: 'org.apache.hadoop', module: 'hadoop-common'
exclude group: 'org.apache.hadoop', module: 'hadoop-client'
}
implementation ('org.apache.hadoop:hadoop-common:2.10.0') {
exclude group: 'io.netty'
exclude group: 'org.apache.curator', module: 'curator-client'
exclude group: 'org.apache.commons', module: 'commons-lang3'

}
implementation ('org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:' + icebergVersion) {
implementation (icebergSparkRuntimeModule) {
exclude group: 'io.netty'
}
implementation 'commons-cli:commons-cli:1.5.0'
implementation 'org.reflections:reflections:0.10.2'
implementation 'org.springframework.boot:spring-boot-starter-webflux:2.7.8'
implementation 'io.netty:netty-resolver-dns-native-macos:4.1.75.Final:osx-x86_64'
implementation 'org.springframework.retry:spring-retry:1.3.3'
implementation 'org.apache.logging.log4j:log4j-core:2.18.0'
implementation 'org.apache.logging.log4j:log4j-slf4j-impl:2.18.0'
implementation 'org.apache.logging.log4j:log4j-api:2.18.0'
implementation 'com.fasterxml.jackson.core:jackson-core:2.13.3'
implementation 'com.fasterxml.jackson.core:jackson-annotations:2.13.3'
implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.3'
implementation 'com.fasterxml.jackson.jaxrs:jackson-jaxrs-json-provider:2.13.3'
implementation 'com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.13.3'
implementation 'com.fasterxml.woodstox:woodstox-core:6.2.7'

// open telemetry related classed. Latest Okhttp version is 4.10.0, pinning to 4.9.3 to avoid dependency issues
implementation 'com.squareup.okhttp3:okhttp:' + ok_http3_version
implementation 'com.squareup.okhttp:okhttp:2.7.5'
implementation 'com.squareup.okio:okio:3.2.0'
implementation 'com.squareup.okio:okio-jvm:3.2.0'
implementation 'org.jetbrains.kotlin:kotlin-stdlib:2.0.20'
implementation 'org.jetbrains.kotlin:kotlin-stdlib-jdk7:2.0.20'
implementation 'org.jetbrains.kotlin:kotlin-stdlib-jdk8:2.0.20'
implementation 'io.opentelemetry:opentelemetry-api:1.18.0'
implementation 'io.opentelemetry:opentelemetry-exporter-otlp:1.18.0'
implementation 'io.opentelemetry:opentelemetry-sdk:1.18.0'
implementation 'io.opentelemetry:opentelemetry-sdk-extension-autoconfigure:1.14.0-alpha'
implementation 'io.opentelemetry:opentelemetry-semconv:1.14.0-alpha'
implementation 'org.apache.commons:commons-lang3:3.12.0'

testImplementation (project(path: ':integrations:spark:spark-3.1:openhouse-spark-runtime_2.12', configuration: 'shadow')) {
testImplementation (project(path: openhouseSparkRuntimeModule, configuration: 'shadow')) {
exclude group: 'io.netty'
exclude group: 'org.apache.hadoop', module: 'hadoop-common'
exclude group: 'org.apache.hadoop', module: 'hadoop-client'
}
// Otherwise throws the error: Scala module 2.10.0 requires Jackson Databind version >= 2.10.0 and < 2.11.0
testImplementation 'com.fasterxml.jackson.module:jackson-module-scala_2.12:2.13.1'
testImplementation 'org.mockito:mockito-inline:4.11.0'
testImplementation 'org.powermock:powermock-module-junit4:2.0.9'
testImplementation 'org.powermock:powermock-api-mockito2:2.0.9'
testImplementation(project(':tables-test-fixtures:tables-test-fixtures_2.12')) {
testImplementation(project(tablesTestFixturesModule)) {
exclude group: "io.netty"
}
testRuntimeOnly("org.eclipse.jetty:jetty-server:11.0.2")

}

// Need spark runtime to be built before this test for this project to run successfully because compileOnly and
// testImplementation dependencies are not triggering it.
test.dependsOn ':integrations:spark:spark-3.1:openhouse-spark-runtime_2.12:build'

shadowJar {
zip64 = true
archiveClassifier.set('uber')
mergeServiceFiles() // merge META-INF/services configuration files to allow FileSystem to be discovered
dependencies {
// unnecessary dependencies from iceberg-spark3-runtime
exclude(dependency('org.apache.iceberg::'))
// this dependency will be provided at runtime
exclude(dependency('org.apache.iceberg:iceberg-spark3-runtime::'))

relocate('io', 'openhouse.relocated.io') {
exclude 'io.netty.resolver.dns.macos.**' // dynamically loaded classes
}
relocate('com', 'openhouse.relocated.com') {
exclude 'com.linkedin.openhouse.**' // don't want our classes to be shaded
exclude 'com.ctc.wstx.**' // dynamically loaded classes
exclude 'com.squareup.**'
exclude '%regex[com.sun.security.*]'
}
relocate 'okhttp3', 'openhouse.relocated.okhttp3'
relocate 'okio', 'openhouse.relocated.okio'
relocate 'reactor', 'openhouse.relocated.reactor'
relocate('org','openhouse.relocated.org') {
exclude 'org.apache.iceberg.**' // these are runtime classes, we shouldn't relocate them unless we shade them
exclude '%regex[org.apache.hadoop.*]' // these are runtime classes too, use regex to exclude string literals
exclude 'org.apache.commons.**' // these are part of method signatures reused in sub-classes
exclude 'org.apache.avro.**' // these runtime classes too
exclude 'org.apache.spark.**' // these runtime classes too
exclude 'org.springframework.**' // otherwise fails with ClassNotFoundException: org.springframework.http.codec.ClientCodecConfigurer
exclude 'org.log4j.**'
exclude 'org.slf4j.**'
exclude 'org.apache.log4j.**'
exclude 'org.apache.logging.**' // otherwise fails with add log4j-core to the classpath
exclude 'org.xml.sax.**' // otherwise fails with NoClassDefFoundError: org/xml/sax/ContentHandler
exclude '%regex[org.w3c.*]'
exclude '%regex[org.ietf.*]'
}
}
}

// https://github.com/johnrengelman/shadow/issues/335
// By default shadow doesn't configure the build task to depend on the shadowJar task.
tasks.build.dependsOn tasks.shadowJar

test {
if (JavaVersion.current() >= JavaVersion.VERSION_1_9) {
jvmArgs \
'--add-opens=java.base/java.nio=ALL-UNNAMED',
'--add-exports=java.base/sun.nio.ch=ALL-UNNAMED',
'--add-opens=java.base/sun.util.calendar=ALL-UNNAMED',
'--add-exports=java.base/sun.util.calendar=ALL-UNNAMED'
}
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package com.linkedin.openhouse.jobs.spark;

import avro.shaded.com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
import com.linkedin.openhouse.common.stats.model.IcebergTableStats;
import com.linkedin.openhouse.jobs.util.SparkJobUtil;
Expand Down Expand Up @@ -29,6 +28,7 @@
import org.apache.iceberg.actions.RewriteDataFiles;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.spark.sql.SparkSession;
import scala.collection.JavaConverters;
Expand Down
10 changes: 7 additions & 3 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,12 @@ allprojects {
}
}

if (it.path != ':integrations:spark:spark-3.5:openhouse-spark-3.5-itest') {
def excludedProjects = [
':integrations:spark:spark-3.5:openhouse-spark-3.5-itest',
':apps:openhouse-spark-apps-1.5_2.12'
]

if (!excludedProjects.contains(it.path)) {
configurations.all {
resolutionStrategy {
force 'com.fasterxml.jackson:jackson-bom:2.13.4'
Expand Down Expand Up @@ -128,5 +133,4 @@ tasks.register('CopyGitHooksTask', Copy) {
println 'Make the git hook available in .git/hooks directory.'
from file('scripts/git-hooks')
into file('.git/hooks/')
}

}
Loading
Loading