From aad3f7d5dec93455025a0e3e61411a15a7f716f7 Mon Sep 17 00:00:00 2001 From: Robert Metzger Date: Sat, 4 Apr 2020 10:08:37 +0200 Subject: [PATCH] [FLINK-16973][tests] Add tooling for collecting jvm crash files --- azure-pipelines.yml | 2 ++ .../tests/util/kafka/LocalStandaloneKafkaResource.java | 2 +- flink-end-to-end-tests/test-scripts/common.sh | 3 +++ tools/azure-pipelines/jobs-template.yml | 2 ++ tools/ci/maven-utils.sh | 10 ++++++++++ tools/travis_watchdog.sh | 6 ++++++ 6 files changed, 24 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 89a5089bd3ff2..e0a585372ebad 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -39,6 +39,8 @@ resources: # Container with Maven 3.2.5, SSL to have the same environment everywhere. - container: flink-build-container image: rmetzger/flink-ci:ubuntu-amd64-3528acd + # On AZP provided machines, set this flag to allow writing coredumps in docker + options: --privileged # Define variables: # - See tools/azure-pipelines/jobs-template.yml for a short summary of the caching diff --git a/flink-end-to-end-tests/flink-end-to-end-tests-common-kafka/src/main/java/org/apache/flink/tests/util/kafka/LocalStandaloneKafkaResource.java b/flink-end-to-end-tests/flink-end-to-end-tests-common-kafka/src/main/java/org/apache/flink/tests/util/kafka/LocalStandaloneKafkaResource.java index 1b1a3395c73b7..405690f4ddb76 100644 --- a/flink-end-to-end-tests/flink-end-to-end-tests-common-kafka/src/main/java/org/apache/flink/tests/util/kafka/LocalStandaloneKafkaResource.java +++ b/flink-end-to-end-tests/flink-end-to-end-tests-common-kafka/src/main/java/org/apache/flink/tests/util/kafka/LocalStandaloneKafkaResource.java @@ -199,7 +199,7 @@ private static boolean isZookeeperRunning(final Path kafkaDir) { private static boolean isKafkaRunning(final Path kafkaDir) throws IOException { try { final AtomicBoolean atomicBrokerStarted = new AtomicBoolean(false); - queryBrokerStatus(kafkaDir, line -> 
atomicBrokerStarted.compareAndSet(false, !line.contains("Node does not exist"))); + queryBrokerStatus(kafkaDir, line -> atomicBrokerStarted.compareAndSet(false, line.contains("dataLength ="))); return atomicBrokerStarted.get(); } catch (final IOException ioe) { // we get an exception if zookeeper isn't running diff --git a/flink-end-to-end-tests/test-scripts/common.sh b/flink-end-to-end-tests/test-scripts/common.sh index 44ce92f40fc5b..fad1c74731f55 100644 --- a/flink-end-to-end-tests/test-scripts/common.sh +++ b/flink-end-to-end-tests/test-scripts/common.sh @@ -352,6 +352,7 @@ function check_logs_for_errors { | grep -v "Error while loading kafka-version.properties :null" \ | grep -v "Failed Elasticsearch item request" \ | grep -v "[Terror] modules" \ + | grep -v "HeapDumpOnOutOfMemoryError" \ | grep -ic "error" || true) if [[ ${error_count} -gt 0 ]]; then echo "Found error in log files:" @@ -401,12 +402,14 @@ function check_logs_for_exceptions { function check_logs_for_non_empty_out_files { echo "Checking for non-empty .out files..." # exclude reflective access warnings as these are expected (and currently unavoidable) on Java 9 + # exclude message about JAVA_TOOL_OPTIONS being set (https://bugs.openjdk.java.net/browse/JDK-8039152) if grep -ri -v \ -e "WARNING: An illegal reflective access" \ -e "WARNING: Illegal reflective access"\ -e "WARNING: Please consider reporting"\ -e "WARNING: Use --illegal-access"\ -e "WARNING: All illegal access"\ + -e "Picked up JAVA_TOOL_OPTIONS"\ $FLINK_DIR/log/*.out\ | grep "." 
\ > /dev/null; then diff --git a/tools/azure-pipelines/jobs-template.yml b/tools/azure-pipelines/jobs-template.yml index 85217901a655d..e036ebc7f2bc5 100644 --- a/tools/azure-pipelines/jobs-template.yml +++ b/tools/azure-pipelines/jobs-template.yml @@ -117,6 +117,8 @@ jobs: echo "##vso[task.setvariable variable=PATH]$JAVA_HOME_11_X64/bin:$PATH" displayName: "Set to jdk11" condition: eq('${{parameters.jdk}}', 'jdk11') + - script: sudo sysctl -w kernel.core_pattern=core.%p + displayName: Set coredump pattern # Test - script: STAGE=test ${{parameters.environment}} ./tools/azure_controller.sh $(module) displayName: Test - $(module) diff --git a/tools/ci/maven-utils.sh b/tools/ci/maven-utils.sh index f36e3749985cf..1ea3d28de8abd 100755 --- a/tools/ci/maven-utils.sh +++ b/tools/ci/maven-utils.sh @@ -67,3 +67,13 @@ function setup_maven { echo "Installed Maven ${MAVEN_VERSION} to ${M2_HOME}" } + +function collect_coredumps { # move crash artifacts (.hprof, .dump, .dumpstream, hs*.log, core / core.<digits>) found under $1 into $2 + local SEARCHDIR=$1 # directory tree that is searched recursively + local TARGET_DIR=$2 # directory the matching files are moved into + echo "Searching for .dump, .dumpstream and related files in '$SEARCHDIR'" + find "$SEARCHDIR" -type f -regextype posix-extended -iregex '.*\.hprof|.*\.dump|.*\.dumpstream|.*hs.*\.log|.*/core(\.[0-9]+)?$' -print0 | while IFS= read -r -d '' file; do # NUL-delimited so paths containing whitespace or glob characters stay intact + echo "Moving '$file' to target directory ('$TARGET_DIR')" + mv "$file" "$TARGET_DIR/" + done +} diff --git a/tools/travis_watchdog.sh b/tools/travis_watchdog.sh index bb05113b4f676..296789891e375 100755 --- a/tools/travis_watchdog.sh +++ b/tools/travis_watchdog.sh @@ -97,6 +97,10 @@ if [ ! -z "$TF_BUILD" ] ; then ARTIFACTS_FILE=${BUILD_BUILDNUMBER}.tar.gz fi +# enable coredumps +ulimit -c unlimited +export JAVA_TOOL_OPTIONS="-XX:+HeapDumpOnOutOfMemoryError" + if [ $TEST == $STAGE_PYTHON ]; then CMD=$PYTHON_TEST CMD_PID=$PYTHON_PID @@ -276,6 +280,8 @@ case $TEST in ;; esac +collect_coredumps "$(pwd)" "$ARTIFACTS_DIR" + upload_artifacts_s3 # since we are in flink/tools/artifacts