Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PR1] refactoring :apps build and adding spark3.5/iceberg1.5 artifact #255

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions apps/spark-3.5/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// Build script for the Spark 3.5 / Iceberg 1.5 apps artifact.
plugins {
// dependency in apps-spark-conventions — the convention plugin below expects the
// Shadow plugin to already be applied, so keep this ordering (NOTE(review): confirm
// against the apps-spark-conventions plugin source).
id 'com.github.johnrengelman.shadow' version '7.1.2'
id 'openhouse.apps-spark-conventions'
}

// Version pins and module coordinates for this Spark line. These ext properties
// are read by the shared apps-spark-conventions plugin, so their names must not change.
ext {
    log4jVersion = '2.20.0'

    sparkVersion = '3.5.2'
    icebergVersion = '1.5.2'
    sparkVersionSuffix = '3.5'
    // OpenHouse Spark runtime integration module matching this Spark line.
    openhouseSparkRuntimeModule = ":integrations:spark:spark-${sparkVersionSuffix}:openhouse-spark-3.5-runtime_2.12"
    // Iceberg's Spark runtime jar, matched to the Spark/Scala versions above.
    icebergSparkRuntimeModule = "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:${icebergVersion}"
    // Table test fixtures built against Iceberg 1.5.
    tablesTestFixturesModule = ':tables-test-fixtures:tables-test-fixtures-iceberg-1.5_2.12'
}

dependencies {
// compileOnly: the OpenHouse Spark runtime is not bundled into this artifact
// (presumably provided at run time — NOTE(review): confirm deployment model).
// Netty and Hadoop are excluded to avoid clashing with the versions Spark ships.
compileOnly (project(path: openhouseSparkRuntimeModule)) {
exclude group: 'io.netty'
exclude group: 'org.apache.hadoop', module: 'hadoop-common'
exclude group: 'org.apache.hadoop', module: 'hadoop-client'
}

implementation "org.apache.logging.log4j:log4j-slf4j-impl:${log4jVersion}"
// Exclude datalayout's Spark 3.1 Iceberg runtime so it cannot leak into
// this Spark 3.5 build.
implementation(project(':libs:datalayout')) {
exclude group: 'org.apache.iceberg', module: 'iceberg-spark-runtime-3.1_2.12'
}
// Individual Iceberg modules pinned to icebergVersion (1.5.x).
implementation("org.apache.iceberg:iceberg-bundled-guava:${icebergVersion}")
implementation("org.apache.iceberg:iceberg-data:${icebergVersion}")
implementation("org.apache.iceberg:iceberg-core:${icebergVersion}")
implementation("org.apache.iceberg:iceberg-common:${icebergVersion}")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add iceberg-conventions-1.5.2.gradle to remove them.

// Spark itself, with Netty/Hadoop excluded for the same conflict-avoidance
// reason as above.
implementation ('org.apache.spark:spark-core_2.12:' + sparkVersion) {
exclude group: 'io.netty'
exclude group: 'org.apache.hadoop', module: 'hadoop-common'
exclude group: 'org.apache.hadoop', module: 'hadoop-client'
}
implementation ('org.apache.spark:spark-sql_2.12:' + sparkVersion) {
exclude group: 'io.netty'
exclude group: 'org.apache.hadoop', module: 'hadoop-common'
exclude group: 'org.apache.hadoop', module: 'hadoop-client'
}
implementation (icebergSparkRuntimeModule) {
exclude group: 'io.netty'
}

// Tests run against the shaded ('shadow') runtime jar rather than the plain
// classes used for compilation above.
testImplementation (project(path: openhouseSparkRuntimeModule, configuration: 'shadow')) {
exclude group: 'io.netty'
exclude group: 'org.apache.hadoop', module: 'hadoop-common'
exclude group: 'org.apache.hadoop', module: 'hadoop-client'
}
testImplementation(project(tablesTestFixturesModule)) {
exclude group: "io.netty"
}
}

// Reuse the source directories of the Spark 3.1 apps module so both artifacts
// are built from the same code base; this module only swaps the Spark/Iceberg
// dependency versions.
// NOTE(review): reading another project's sourceSets at configuration time
// assumes :apps:openhouse-spark-apps_2.12 has already been evaluated —
// consider evaluationDependsOn(':apps:openhouse-spark-apps_2.12') to make the
// ordering explicit; confirm it is currently guaranteed.
sourceSets {
main {
java {
srcDirs = ['src/main/java', project(':apps:openhouse-spark-apps_2.12').sourceSets.main.java.srcDirs]
}
}
test {
java {
srcDirs = ['src/test/java', project(':apps:openhouse-spark-apps_2.12').sourceSets.test.java.srcDirs]
}
}
}
159 changes: 35 additions & 124 deletions apps/spark/build.gradle
Original file line number Diff line number Diff line change
@@ -1,38 +1,49 @@
plugins {
id 'openhouse.java-conventions'
id 'openhouse.hadoop-conventions'
id 'openhouse.iceberg-conventions-1.2'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line should not be removed.

id 'openhouse.maven-publish'
// dependency in apps-spark-conventions
id 'com.github.johnrengelman.shadow' version '7.1.2'
}

configurations {
// Excluding these libraries avoids competing implementations for LoggerFactory
// Standardizing on slf4j + log4j2 as implementation.
all*.exclude module : 'spring-boot-starter-logging'
all*.exclude module : 'logback-classic'
shadow.extendsFrom implementation
id 'openhouse.apps-spark-conventions'
}

// Version pins and module coordinates for the Spark 3.1 / Iceberg 1.2 build.
// These ext properties mirror those in apps/spark-3.5/build.gradle and are
// read by the shared apps-spark-conventions plugin, so the property names
// must stay in sync between the two modules.
ext {
log4jVersion = "2.18.0"

sparkVersion = '3.1.1'
icebergVersion = '1.2.0'
sparkVersionSuffix = "3.1"
openhouseSparkRuntimeModule = ":integrations:spark:spark-${sparkVersionSuffix}:openhouse-spark-runtime_2.12"
icebergSparkRuntimeModule = "org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:${icebergVersion}"
tablesTestFixturesModule = ":tables-test-fixtures:tables-test-fixtures_2.12"
}

dependencies {
implementation project(':iceberg:openhouse:internalcatalog')
implementation project(':client:hts')
implementation project(':client:jobsclient')
implementation project(':client:tableclient')
implementation project(':client:secureclient')
implementation project(':services:common')
implementation project(':cluster:storage')
compileOnly (project(path: ':integrations:spark:spark-3.1:openhouse-spark-runtime_2.12')) {
compileOnly (project(path: openhouseSparkRuntimeModule)) {
exclude group: 'io.netty'
exclude group: 'org.apache.hadoop', module: 'hadoop-common'
exclude group: 'org.apache.hadoop', module: 'hadoop-client'
}

implementation "org.apache.logging.log4j:log4j-slf4j-impl:${log4jVersion}"
implementation project(':libs:datalayout')
implementation("org.apache.iceberg:iceberg-bundled-guava") {
version {
strictly("${icebergVersion}")
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these really helpful? In practice I never find `force` or `strictly` covering everything I want, since there could always be a fatJar in the dependency tree, and these keywords don't work for those.

Copy link
Collaborator Author

@cbb330 cbb330 Nov 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

strictly here overrides any dependencies of compaction that are defining a higher version of 1.5.2, such as

services
cluster:storage
internalcatalog
htscatalog

Without `strictly`, these dependencies would unintentionally bump up the Iceberg version in compaction.

an alternative to forcing the version to be LOWER than what is defined in transitive dependencies, is:

  1. excluding all iceberg stuff from all of the dependencies above. but i think this is less readable and from what I understand, utilizing the resolution rules is preferred to exclusions
  2. using the 1.2 conventions plugin which ALSO defines strictly. but i prefer to be explicit

}
implementation("org.apache.iceberg:iceberg-data") {
version {
strictly("${icebergVersion}")
}
}
implementation("org.apache.iceberg:iceberg-core") {
version {
strictly("${icebergVersion}")
}
}
implementation("org.apache.iceberg:iceberg-common") {
version {
strictly("${icebergVersion}")
}
}
implementation ('org.apache.spark:spark-core_2.12:' + sparkVersion) {
exclude group: 'io.netty'
exclude group: 'org.apache.hadoop', module: 'hadoop-common'
Expand All @@ -43,116 +54,16 @@ dependencies {
exclude group: 'org.apache.hadoop', module: 'hadoop-common'
exclude group: 'org.apache.hadoop', module: 'hadoop-client'
}
implementation ('org.apache.hadoop:hadoop-common:2.10.0') {
exclude group: 'io.netty'
exclude group: 'org.apache.curator', module: 'curator-client'
exclude group: 'org.apache.commons', module: 'commons-lang3'

}
implementation ('org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:' + icebergVersion) {
implementation (icebergSparkRuntimeModule) {
exclude group: 'io.netty'
}
implementation 'commons-cli:commons-cli:1.5.0'
implementation 'org.reflections:reflections:0.10.2'
implementation 'org.springframework.boot:spring-boot-starter-webflux:2.7.8'
implementation 'io.netty:netty-resolver-dns-native-macos:4.1.75.Final:osx-x86_64'
implementation 'org.springframework.retry:spring-retry:1.3.3'
implementation 'org.apache.logging.log4j:log4j-core:2.18.0'
implementation 'org.apache.logging.log4j:log4j-slf4j-impl:2.18.0'
implementation 'org.apache.logging.log4j:log4j-api:2.18.0'
implementation 'com.fasterxml.jackson.core:jackson-core:2.13.3'
implementation 'com.fasterxml.jackson.core:jackson-annotations:2.13.3'
implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.3'
implementation 'com.fasterxml.jackson.jaxrs:jackson-jaxrs-json-provider:2.13.3'
implementation 'com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.13.3'
implementation 'com.fasterxml.woodstox:woodstox-core:6.2.7'

// open telemetry related classed. Latest Okhttp version is 4.10.0, pinning to 4.9.3 to avoid dependency issues
implementation 'com.squareup.okhttp3:okhttp:' + ok_http3_version
implementation 'com.squareup.okhttp:okhttp:2.7.5'
implementation 'com.squareup.okio:okio:3.2.0'
implementation 'com.squareup.okio:okio-jvm:3.2.0'
implementation 'org.jetbrains.kotlin:kotlin-stdlib:2.0.20'
implementation 'org.jetbrains.kotlin:kotlin-stdlib-jdk7:2.0.20'
implementation 'org.jetbrains.kotlin:kotlin-stdlib-jdk8:2.0.20'
implementation 'io.opentelemetry:opentelemetry-api:1.18.0'
implementation 'io.opentelemetry:opentelemetry-exporter-otlp:1.18.0'
implementation 'io.opentelemetry:opentelemetry-sdk:1.18.0'
implementation 'io.opentelemetry:opentelemetry-sdk-extension-autoconfigure:1.14.0-alpha'
implementation 'io.opentelemetry:opentelemetry-semconv:1.14.0-alpha'
implementation 'org.apache.commons:commons-lang3:3.12.0'

testImplementation (project(path: ':integrations:spark:spark-3.1:openhouse-spark-runtime_2.12', configuration: 'shadow')) {
testImplementation (project(path: openhouseSparkRuntimeModule, configuration: 'shadow')) {
exclude group: 'io.netty'
exclude group: 'org.apache.hadoop', module: 'hadoop-common'
exclude group: 'org.apache.hadoop', module: 'hadoop-client'
}
// Otherwise throws the error: Scala module 2.10.0 requires Jackson Databind version >= 2.10.0 and < 2.11.0
testImplementation 'com.fasterxml.jackson.module:jackson-module-scala_2.12:2.13.1'
testImplementation 'org.mockito:mockito-inline:4.11.0'
testImplementation 'org.powermock:powermock-module-junit4:2.0.9'
testImplementation 'org.powermock:powermock-api-mockito2:2.0.9'
testImplementation(project(':tables-test-fixtures:tables-test-fixtures_2.12')) {
testImplementation(project(tablesTestFixturesModule)) {
exclude group: "io.netty"
}
testRuntimeOnly("org.eclipse.jetty:jetty-server:11.0.2")

}

// Need spark runtime to be built before this test for this project to run successfully because compileOnly and
// testImplementation dependencies are not triggering it.
test.dependsOn ':integrations:spark:spark-3.1:openhouse-spark-runtime_2.12:build'

shadowJar {
zip64 = true
archiveClassifier.set('uber')
mergeServiceFiles() // merge META-INF/services configuration files to allow FileSystem to be discovered
dependencies {
// unnecessary dependencies from iceberg-spark3-runtime
exclude(dependency('org.apache.iceberg::'))
// this dependency will be provided at runtime
exclude(dependency('org.apache.iceberg:iceberg-spark3-runtime::'))

relocate('io', 'openhouse.relocated.io') {
exclude 'io.netty.resolver.dns.macos.**' // dynamically loaded classes
}
relocate('com', 'openhouse.relocated.com') {
exclude 'com.linkedin.openhouse.**' // don't want our classes to be shaded
exclude 'com.ctc.wstx.**' // dynamically loaded classes
exclude 'com.squareup.**'
exclude '%regex[com.sun.security.*]'
}
relocate 'okhttp3', 'openhouse.relocated.okhttp3'
relocate 'okio', 'openhouse.relocated.okio'
relocate 'reactor', 'openhouse.relocated.reactor'
relocate('org','openhouse.relocated.org') {
exclude 'org.apache.iceberg.**' // these are runtime classes, we shouldn't relocate them unless we shade them
exclude '%regex[org.apache.hadoop.*]' // these are runtime classes too, use regex to exclude string literals
exclude 'org.apache.commons.**' // these are part of method signatures reused in sub-classes
exclude 'org.apache.avro.**' // these runtime classes too
exclude 'org.apache.spark.**' // these runtime classes too
exclude 'org.springframework.**' // otherwise fails with ClassNotFoundException: org.springframework.http.codec.ClientCodecConfigurer
exclude 'org.log4j.**'
exclude 'org.slf4j.**'
exclude 'org.apache.log4j.**'
exclude 'org.apache.logging.**' // otherwise fails with add log4j-core to the classpath
exclude 'org.xml.sax.**' // otherwise fails with NoClassDefFoundError: org/xml/sax/ContentHandler
exclude '%regex[org.w3c.*]'
exclude '%regex[org.ietf.*]'
}
}
}

// https://github.com/johnrengelman/shadow/issues/335
// By default shadow doesn't configure the build task to depend on the shadowJar task.
tasks.build.dependsOn tasks.shadowJar

test {
if (JavaVersion.current() >= JavaVersion.VERSION_1_9) {
jvmArgs \
'--add-opens=java.base/java.nio=ALL-UNNAMED',
'--add-exports=java.base/sun.nio.ch=ALL-UNNAMED',
'--add-opens=java.base/sun.util.calendar=ALL-UNNAMED',
'--add-exports=java.base/sun.util.calendar=ALL-UNNAMED'
}
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package com.linkedin.openhouse.jobs.spark;

import avro.shaded.com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
import com.linkedin.openhouse.common.stats.model.IcebergTableStats;
import com.linkedin.openhouse.jobs.util.SparkJobUtil;
Expand Down Expand Up @@ -29,6 +28,7 @@
import org.apache.iceberg.actions.RewriteDataFiles;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.spark.sql.SparkSession;
import scala.collection.JavaConverters;
Expand Down
5 changes: 1 addition & 4 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,6 @@ allprojects {
if (it.path != ':integrations:spark:spark-3.5:openhouse-spark-3.5-itest') {
configurations.all {
resolutionStrategy {
force 'com.fasterxml.jackson:jackson-bom:2.13.4'
force 'com.fasterxml.jackson.core:jackson-databind:2.13.4'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are these changes relevant ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this may actually be dangerous to remove, and I will fix it, since idk what is currently relying on this.

but this strict version constraint breaks :apps:spark-3.5 because that also needs jackson 2.15

force 'org.apache.orc:orc-core:1.8.3'
force 'com.google.guava:guava:31.1-jre'
}
Expand Down Expand Up @@ -128,5 +126,4 @@ tasks.register('CopyGitHooksTask', Copy) {
println 'Make the git hook available in .git/hooks directory.'
from file('scripts/git-hooks')
into file('.git/hooks/')
}

}
Loading
Loading