diff --git a/src/main/scala/com/amazon/deequ/VerificationRunBuilder.scala b/src/main/scala/com/amazon/deequ/VerificationRunBuilder.scala index 150e164fd..f34b7f6ee 100644 --- a/src/main/scala/com/amazon/deequ/VerificationRunBuilder.scala +++ b/src/main/scala/com/amazon/deequ/VerificationRunBuilder.scala @@ -16,7 +16,7 @@ package com.amazon.deequ -import com.amazon.deequ.anomalydetection.AnomalyDetectionStrategy +import com.amazon.deequ.anomalydetection.{AnomalyDetectionStrategy, AnomalyDetectionStrategyWithExtendedResults} import com.amazon.deequ.analyzers.Analyzer import com.amazon.deequ.analyzers.{State, _} import com.amazon.deequ.checks.{Check, CheckLevel} @@ -240,6 +240,24 @@ class VerificationRunBuilderWithRepository( anomalyDetectionStrategy, analyzer, anomalyCheckConfigOrDefault) this } + + def addAnomalyCheckWithExtendedResults[S <: State[S]]( + anomalyDetectionStrategy: AnomalyDetectionStrategyWithExtendedResults, + analyzer: Analyzer[S, Metric[Double]], + anomalyCheckConfig: Option[AnomalyCheckConfig] = None) + : this.type = { + + val anomalyCheckConfigOrDefault = anomalyCheckConfig.getOrElse { + + val checkDescription = s"Anomaly check for ${analyzer.toString}" + + AnomalyCheckConfig(CheckLevel.Warning, checkDescription) + } + + checks :+= VerificationRunBuilderHelper.getAnomalyCheckWithExtendedResults( + metricsRepository.get, anomalyDetectionStrategy, analyzer, anomalyCheckConfigOrDefault) + this + } } class VerificationRunBuilderWithSparkSession( @@ -315,6 +333,32 @@ private[this] object VerificationRunBuilderHelper { anomalyCheckConfig.beforeDate ) } + + /** + * Build a check using Anomaly Detection with extended results methods + * + * @param metricsRepository A metrics repository to get the previous results + * @param anomalyDetectionStrategyWithExtendedResults The anomaly detection strategy with extended results + * @param analyzer The analyzer for the metric to run anomaly detection on + * @param anomalyCheckConfig Some configuration settings for the Check + */ + def getAnomalyCheckWithExtendedResults[S <: State[S]]( + metricsRepository: MetricsRepository, + anomalyDetectionStrategyWithExtendedResults: AnomalyDetectionStrategyWithExtendedResults, + analyzer: Analyzer[S, Metric[Double]], + anomalyCheckConfig: AnomalyCheckConfig) + : Check = { + + Check(anomalyCheckConfig.level, anomalyCheckConfig.description) + .isNewestPointNonAnomalousWithExtendedResults( + metricsRepository, + anomalyDetectionStrategyWithExtendedResults, + analyzer, + anomalyCheckConfig.withTagValues, + anomalyCheckConfig.afterDate, + anomalyCheckConfig.beforeDate + ) + } } /** diff --git a/src/main/scala/com/amazon/deequ/analyzers/applicability/Applicability.scala b/src/main/scala/com/amazon/deequ/analyzers/applicability/Applicability.scala index e2c282c14..dc55c84cf 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/applicability/Applicability.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/applicability/Applicability.scala @@ -21,7 +21,8 @@ import java.util.Calendar import com.amazon.deequ.analyzers.{Analyzer, State} import com.amazon.deequ.checks.Check -import com.amazon.deequ.constraints.{AnalysisBasedConstraint, Constraint, ConstraintDecorator} +import com.amazon.deequ.constraints.{AnalysisBasedConstraint, AnomalyExtendedResultsConstraint, + Constraint, ConstraintDecorator} import com.amazon.deequ.metrics.Metric import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Row, SparkSession} @@ -187,9 +188,13 @@ private[deequ] class Applicability(session: SparkSession) { 
case (name, nc: ConstraintDecorator) => name -> nc.inner case (name, c: Constraint) => name -> c } - .collect { case (name, constraint: AnalysisBasedConstraint[_, _, _]) => - val metric = constraint.analyzer.calculate(data).value - name -> metric + .collect { + case (name, constraint: AnalysisBasedConstraint[_, _, _]) => + val metric = constraint.analyzer.calculate(data).value + name -> metric + case (name, constraint: AnomalyExtendedResultsConstraint[_, _, _]) => + val metric = constraint.analyzer.calculate(data).value + name -> metric } val constraintApplicabilities = check.constraints.zip(namedMetrics).map { diff --git a/src/main/scala/com/amazon/deequ/anomalydetection/AnomalyDetectionStrategy.scala b/src/main/scala/com/amazon/deequ/anomalydetection/AnomalyDetectionStrategy.scala index 0c3f6805e..5e48e96bc 100644 --- a/src/main/scala/com/amazon/deequ/anomalydetection/AnomalyDetectionStrategy.scala +++ b/src/main/scala/com/amazon/deequ/anomalydetection/AnomalyDetectionStrategy.scala @@ -30,3 +30,17 @@ trait AnomalyDetectionStrategy { dataSeries: Vector[Double], searchInterval: (Int, Int) = (0, Int.MaxValue)): Seq[(Int, Anomaly)] } +trait AnomalyDetectionStrategyWithExtendedResults { + + /** + * Search for anomalies in a series of data points, returns extended results. + * + * @param dataSeries The data contained in a Vector of Doubles + * @param searchInterval The indices between which anomalies should be detected. [a, b). + * @return The indices of all data points with their corresponding anomaly extended results wrapper + * object. + */ + def detectWithExtendedResults( + dataSeries: Vector[Double], + searchInterval: (Int, Int) = (0, Int.MaxValue)): Seq[(Int, AnomalyDetectionDataPoint)] +} diff --git a/src/main/scala/com/amazon/deequ/anomalydetection/AnomalyDetector.scala b/src/main/scala/com/amazon/deequ/anomalydetection/AnomalyDetector.scala index e7146c0e9..96f3925af 100644 --- a/src/main/scala/com/amazon/deequ/anomalydetection/AnomalyDetector.scala +++ b/src/main/scala/com/amazon/deequ/anomalydetection/AnomalyDetector.scala @@ -56,12 +56,8 @@ case class AnomalyDetector(strategy: AnomalyDetectionStrategy) { val allDataPoints = sortedDataPoints :+ newPoint - // Run anomaly - val anomalies = detectAnomaliesInHistory(allDataPoints, (newPoint.time, Long.MaxValue)) - .anomalies - - // Create a Detection result with all anomalies - DetectionResult(anomalies) + // Run anomaly and create a Detection result with all anomalies + detectAnomaliesInHistory(allDataPoints, (newPoint.time, Long.MaxValue)) } /** @@ -100,3 +96,86 @@ case class AnomalyDetector(strategy: AnomalyDetectionStrategy) { DetectionResult(anomalies.map { case (index, anomaly) => (sortedTimestamps(index), anomaly) }) } } + +case class AnomalyDetectorWithExtendedResults(strategy: AnomalyDetectionStrategyWithExtendedResults) { + + + /** + * Given a sequence of metrics and a current value, detects if there is an anomaly by using the + * given algorithm and returns extended results. + * + * @param historicalDataPoints Sequence of tuples (Points in time with corresponding Metric). 
+   * @param newPoint             A new data point to check if there are anomalies.
+   * @return ExtendedDetectionResult, containing all checked data points with their anomaly details.
+   */
+  def isNewPointAnomalousWithExtendedResults(
+      historicalDataPoints: Seq[DataPoint[Double]],
+      newPoint: DataPoint[Double])
+    : ExtendedDetectionResult = {
+
+    require(historicalDataPoints.nonEmpty, "historicalDataPoints must not be empty!")
+
+    val sortedDataPoints = historicalDataPoints.sortBy(_.time)
+
+    val firstDataPointTime = sortedDataPoints.head.time
+    val lastDataPointTime = sortedDataPoints.last.time
+
+    val newPointTime = newPoint.time
+
+    require(lastDataPointTime < newPointTime,
+      s"Can't decide which range to use for anomaly detection. New data point with time " +
+        s"$newPointTime is in history range ($firstDataPointTime - $lastDataPointTime)!")
+
+    val allDataPoints = sortedDataPoints :+ newPoint
+
+    // Run anomaly detection and create an ExtendedDetectionResult with all data points and their anomaly details
+    detectAnomaliesInHistoryWithExtendedResults(allDataPoints, (newPoint.time, Long.MaxValue))
+  }
+
+
+  /**
+   * Given a strategy, detects anomalies in a time series after some preprocessing
+   * and returns extended results.
+   *
+   * @param dataSeries     Sequence of tuples (Points in time with corresponding value).
+   * @param searchInterval The interval in which anomalies should be detected. [a, b).
+   * @return A wrapper object, containing all data points with anomaly extended results.
+   */
+  def detectAnomaliesInHistoryWithExtendedResults(
+      dataSeries: Seq[DataPoint[Double]],
+      searchInterval: (Long, Long) = (Long.MinValue, Long.MaxValue))
+    : ExtendedDetectionResult = {
+
+    def findIndexForBound(sortedTimestamps: Seq[Long], boundValue: Long): Int = {
+      sortedTimestamps.search(boundValue).insertionPoint
+    }
+
+    val (searchStart, searchEnd) = searchInterval
+
+    require(searchStart <= searchEnd,
+      "The first interval element has to be smaller or equal to the last.")
+
+    // Remove missing values and sort series by time
+    val removedMissingValues = dataSeries.filter { _.metricValue.isDefined }
+    val sortedSeries = removedMissingValues.sortBy { _.time }
+    val sortedTimestamps = sortedSeries.map { _.time }
+
+    // Find indices of lower and upper bound
+    val lowerBoundIndex = findIndexForBound(sortedTimestamps, searchStart)
+    val upperBoundIndex = findIndexForBound(sortedTimestamps, searchEnd)
+
+    val anomalies = strategy.detectWithExtendedResults(
+      sortedSeries.flatMap { _.metricValue }.toVector, (lowerBoundIndex, upperBoundIndex))
+
+    ExtendedDetectionResult(anomalies.map { case (index, anomaly) => (sortedTimestamps(index), anomaly) })
+  }
+
+}
diff --git a/src/main/scala/com/amazon/deequ/anomalydetection/BaseChangeStrategy.scala b/src/main/scala/com/amazon/deequ/anomalydetection/BaseChangeStrategy.scala
index e00c86772..2d0cf3948 100644
--- a/src/main/scala/com/amazon/deequ/anomalydetection/BaseChangeStrategy.scala
+++ b/src/main/scala/com/amazon/deequ/anomalydetection/BaseChangeStrategy.scala
@@ -27,7 +27,7 @@ import breeze.linalg.DenseVector
  * Set to 1 it calculates the difference between two consecutive values.
  */
 trait BaseChangeStrategy
-  extends AnomalyDetectionStrategy {
+  extends AnomalyDetectionStrategy with AnomalyDetectionStrategyWithExtendedResults {
 
   def maxRateDecrease: Option[Double]
   def maxRateIncrease: Option[Double]
@@ -67,7 +67,8 @@
   }
 
   /**
-   * Search for anomalies in a series of data points.
+   * Search for anomalies in a series of data points.
This function uses the + * detectWithExtendedResults function and then filters and maps to return only anomaly data point objects. * * If there aren't enough data points preceding the searchInterval, * it may happen that the interval's first elements (depending on the specified order) @@ -81,6 +82,30 @@ trait BaseChangeStrategy dataSeries: Vector[Double], searchInterval: (Int, Int)) : Seq[(Int, Anomaly)] = { + + detectWithExtendedResults(dataSeries, searchInterval) + .filter { case (_, anomDataPoint) => anomDataPoint.isAnomaly } + .map { case (i, anomDataPoint) => + (i, Anomaly(Some(anomDataPoint.dataMetricValue), anomDataPoint.confidence, anomDataPoint.detail)) + } + } + + /** + * Search for anomalies in a series of data points, returns extended results. + * + * If there aren't enough data points preceding the searchInterval, + * it may happen that the interval's first elements (depending on the specified order) + * can't be flagged as anomalies. + * + * @param dataSeries The data contained in a Vector of Doubles + * @param searchInterval The indices between which anomalies should be detected. [a, b). + * @return The indices of all anomalies in the interval and their corresponding wrapper object + * with extended results. + */ + override def detectWithExtendedResults( + dataSeries: Vector[Double], + searchInterval: (Int, Int)) + : Seq[(Int, AnomalyDetectionDataPoint)] = { val (start, end) = searchInterval require(start <= end, @@ -89,15 +114,25 @@ trait BaseChangeStrategy val startPoint = Seq(start - order, 0).max val data = diff(DenseVector(dataSeries.slice(startPoint, end): _*), order).data - data.zipWithIndex.filter { case (value, _) => - (value < maxRateDecrease.getOrElse(Double.MinValue) - || value > maxRateIncrease.getOrElse(Double.MaxValue)) - } - .map { case (change, index) => - (index + startPoint + order, Anomaly(Option(dataSeries(index + startPoint + order)), 1.0, - Some(s"[AbsoluteChangeStrategy]: Change of $change is not in bounds [" + - s"${maxRateDecrease.getOrElse(Double.MinValue)}, " + - s"${maxRateIncrease.getOrElse(Double.MaxValue)}]. Order=$order"))) + val lowerBound = maxRateDecrease.getOrElse(Double.MinValue) + val upperBound = maxRateIncrease.getOrElse(Double.MaxValue) + + + data.zipWithIndex.map { + case (change, index) => + val outputSequenceIndex = index + startPoint + order + val value = dataSeries(outputSequenceIndex) + val (detail, isAnomaly) = if (change < lowerBound || change > upperBound) { + (Some(s"[AbsoluteChangeStrategy]: Change of $change is not in bounds [" + + s"$lowerBound, " + + s"$upperBound]. 
Order=$order"), true) + } + else { + (None, false) + } + (outputSequenceIndex, AnomalyDetectionDataPoint(value, change, + BoundedRange(lowerBound = Bound(lowerBound, inclusive = true), + upperBound = Bound(upperBound, inclusive = true)), isAnomaly, 1.0, detail)) } } } diff --git a/src/main/scala/com/amazon/deequ/anomalydetection/BatchNormalStrategy.scala b/src/main/scala/com/amazon/deequ/anomalydetection/BatchNormalStrategy.scala index baff49c03..41a7bad43 100644 --- a/src/main/scala/com/amazon/deequ/anomalydetection/BatchNormalStrategy.scala +++ b/src/main/scala/com/amazon/deequ/anomalydetection/BatchNormalStrategy.scala @@ -33,7 +33,9 @@ import breeze.stats.meanAndVariance case class BatchNormalStrategy( lowerDeviationFactor: Option[Double] = Some(3.0), upperDeviationFactor: Option[Double] = Some(3.0), - includeInterval: Boolean = false) extends AnomalyDetectionStrategy { + includeInterval: Boolean = false) + extends AnomalyDetectionStrategy with AnomalyDetectionStrategyWithExtendedResults + { require(lowerDeviationFactor.isDefined || upperDeviationFactor.isDefined, "At least one factor has to be specified.") @@ -43,7 +45,8 @@ case class BatchNormalStrategy( /** - * Search for anomalies in a series of data points. + * Search for anomalies in a series of data points. This function uses the + * detectWithExtendedResults function and then filters and maps to return only anomaly objects. * * @param dataSeries The data contained in a Vector of Doubles * @param searchInterval The indices between which anomalies should be detected. [a, b). @@ -53,6 +56,25 @@ case class BatchNormalStrategy( dataSeries: Vector[Double], searchInterval: (Int, Int)): Seq[(Int, Anomaly)] = { + detectWithExtendedResults(dataSeries, searchInterval) + .filter { case (_, anomDataPoint) => anomDataPoint.isAnomaly } + .map { case (i, anomDataPoint) => + (i, Anomaly(Some(anomDataPoint.dataMetricValue), anomDataPoint.confidence, anomDataPoint.detail)) + } + } + + /** + * Search for anomalies in a series of data points, returns extended results. + * + * @param dataSeries The data contained in a Vector of Doubles + * @param searchInterval The indices between which anomalies should be detected. [a, b). + * @return The indices of all anomalies in the interval and their corresponding wrapper object + * with extended results. 
+ */ + override def detectWithExtendedResults( + dataSeries: Vector[Double], + searchInterval: (Int, Int)): Seq[(Int, AnomalyDetectionDataPoint)] = { + val (searchStart, searchEnd) = searchInterval require(searchStart <= searchEnd, "The start of the interval can't be larger than the end.") @@ -83,13 +105,18 @@ case class BatchNormalStrategy( dataSeries.zipWithIndex .slice(searchStart, searchEnd) - .filter { case (value, _) => value > upperBound || value < lowerBound } .map { case (value, index) => - - val detail = Some(s"[BatchNormalStrategy]: Value $value is not in " + - s"bounds [$lowerBound, $upperBound].") - - (index, Anomaly(Option(value), 1.0, detail)) + val (detail, isAnomaly) = if (value > upperBound || value < lowerBound) { + (Some(s"[BatchNormalStrategy]: Value $value is not in " + + s"bounds [$lowerBound, $upperBound]."), true) + } else { + (None, false) + } + (index, AnomalyDetectionDataPoint(value, value, + BoundedRange(lowerBound = Bound(lowerBound, inclusive = true), + upperBound = Bound(upperBound, inclusive = true)), isAnomaly, 1.0, detail)) } } + + } diff --git a/src/main/scala/com/amazon/deequ/anomalydetection/ExtendedDetectionResult.scala b/src/main/scala/com/amazon/deequ/anomalydetection/ExtendedDetectionResult.scala new file mode 100644 index 000000000..c966d738a --- /dev/null +++ b/src/main/scala/com/amazon/deequ/anomalydetection/ExtendedDetectionResult.scala @@ -0,0 +1,143 @@ +/** + * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not + * use this file except in compliance with the License. A copy of the License + * is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. This file is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + * + */ + +package com.amazon.deequ.anomalydetection + +/** + * The classes here provide the same anomaly detection functionality as in DetectionResult + * but also provide extended results through details contained in the AnomalyDetectionDataPoint class. + * See below. + */ + +/** + * Anomaly Detection Data Point class + * This class is different from the Anomaly Class in that this class + * wraps around all data points, not just anomalies, and provides extended results including + * if the data point is an anomaly, and the range with bounds used in the anomaly calculation. + * + * @param dataMetricValue The metric value that is the data point. + * @param anomalyMetricValue The metric value that is being used in the anomaly calculation. + * This usually aligns with dataMetricValue but not always, + * like in a rate of change strategy where the rate of change is the anomaly metric + * which may not equal the actual data point value. + * @param anomalyCheckRange The range of bounds used in the anomaly check, the anomalyMetricValue is + * compared to this range. + * @param isAnomaly If the data point is an anomaly. + * @param confidence Confidence of anomaly detection. + * @param detail Detailed error message. 
+ */
+class AnomalyDetectionDataPoint(
+    val dataMetricValue: Double,
+    val anomalyMetricValue: Double,
+    val anomalyCheckRange: BoundedRange,
+    val isAnomaly: Boolean,
+    val confidence: Double,
+    val detail: Option[String]) {
+
+  def canEqual(that: Any): Boolean = {
+    that.isInstanceOf[AnomalyDetectionDataPoint]
+  }
+
+  /**
+   * Tests anomalyDetectionDataPoints for equality. Ignores the detailed error message.
+   *
+   * @param obj The object to compare against.
+   * @return true, if and only if the dataMetricValue, anomalyMetricValue, anomalyCheckRange, isAnomaly
+   *         and confidence are the same.
+   */
+  override def equals(obj: Any): Boolean = {
+    obj match {
+      case anomaly: AnomalyDetectionDataPoint =>
+        anomaly.dataMetricValue == dataMetricValue &&
+          anomaly.anomalyMetricValue == anomalyMetricValue &&
+          anomaly.anomalyCheckRange == anomalyCheckRange &&
+          anomaly.isAnomaly == isAnomaly &&
+          anomaly.confidence == confidence
+      case _ => false
+    }
+  }
+
+  override def hashCode: Int = {
+    val prime = 31
+    var result = 1
+    result = prime * result + dataMetricValue.hashCode()
+    result = prime * result + anomalyMetricValue.hashCode()
+    result = prime * result + anomalyCheckRange.hashCode()
+    result = prime * result + isAnomaly.hashCode()
+    result = prime * result + confidence.hashCode()
+    result
+  }
+
+}
+
+object AnomalyDetectionDataPoint {
+  def apply(dataMetricValue: Double, anomalyMetricValue: Double,
+      anomalyCheckRange: BoundedRange, isAnomaly: Boolean,
+      confidence: Double, detail: Option[String] = None
+  ): AnomalyDetectionDataPoint = {
+    new AnomalyDetectionDataPoint(dataMetricValue, anomalyMetricValue, anomalyCheckRange, isAnomaly, confidence, detail)
+  }
+}
+
+
+/**
+ * BoundedRange class
+ * Defines the range used for anomaly detection.
+ * @param lowerBound The lower bound or threshold.
+ * @param upperBound The upper bound or threshold.
+ */
+case class BoundedRange(lowerBound: Bound, upperBound: Bound)
+
+/**
+ * Bound Class
+ * Class representing a threshold/bound, with a value and an inclusive/exclusive flag.
+ * @param value The value of the bound as a Double.
+ * @param inclusive Boolean indicating if the Bound is inclusive or not.
+ */
+case class Bound(value: Double, inclusive: Boolean)
+
+
+
+/**
+ * ExtendedDetectionResult Class
+ * This class is returned from the detectAnomaliesInHistoryWithExtendedResults function.
+ * @param anomalyDetectionDataPointSequence The sequence of (timestamp, AnomalyDetectionDataPoint) pairs.
+ */
+case class ExtendedDetectionResult(anomalyDetectionDataPointSequence:
+  Seq[(Long, AnomalyDetectionDataPoint)] = Seq.empty)
+
+
+/**
+ * AnomalyDetectionExtendedResult Class
+ * This class contains anomaly detection extended results through an AnomalyDetectionDataPoint.
+ * This is currently an optional field in the ConstraintResult class that is exposed to users.
+ *
+ * Currently, anomaly detection only runs on the "newest" data point (referring to the dataframe being
+ * run on by the verification suite) and not multiple data points, so this will contain that
+ * one AnomalyDetectionDataPoint.
+ * @param anomalyDetectionDataPoint AnomalyDetectionDataPoint of the newest data point generated from the check.
+ */
+case class AnomalyDetectionExtendedResult(anomalyDetectionDataPoint: AnomalyDetectionDataPoint)
+
+/**
+ * AnomalyDetectionAssertionResult Class
+ * This class is returned by the assertion function Check.isNewestPointNonAnomalousWithExtendedResults.
+ * @param hasAnomaly Boolean indicating if there was an anomaly detected.
+ * @param anomalyDetectionExtendedResult AnomalyDetectionExtendedResults class. + */ +case class AnomalyDetectionAssertionResult(hasAnomaly: Boolean, + anomalyDetectionExtendedResult: AnomalyDetectionExtendedResult) diff --git a/src/main/scala/com/amazon/deequ/anomalydetection/OnlineNormalStrategy.scala b/src/main/scala/com/amazon/deequ/anomalydetection/OnlineNormalStrategy.scala index 8bf8b634c..aa9c91276 100644 --- a/src/main/scala/com/amazon/deequ/anomalydetection/OnlineNormalStrategy.scala +++ b/src/main/scala/com/amazon/deequ/anomalydetection/OnlineNormalStrategy.scala @@ -40,7 +40,8 @@ case class OnlineNormalStrategy( lowerDeviationFactor: Option[Double] = Some(3.0), upperDeviationFactor: Option[Double] = Some(3.0), ignoreStartPercentage: Double = 0.1, - ignoreAnomalies: Boolean = true) extends AnomalyDetectionStrategy { + ignoreAnomalies: Boolean = true) + extends AnomalyDetectionStrategy with AnomalyDetectionStrategyWithExtendedResults { require(lowerDeviationFactor.isDefined || upperDeviationFactor.isDefined, "At least one factor has to be specified.") @@ -121,9 +122,10 @@ case class OnlineNormalStrategy( /** - * Search for anomalies in a series of data points. + * Search for anomalies in a series of data points. This function uses the + * detectWithExtendedResults function and then filters and maps to return only anomaly objects. * - * @param dataSeries The data contained in a Vector of Doubles + * @param dataSeries The data contained in a Vector of Doubles. * @param searchInterval The indices between which anomalies should be detected. [a, b). * @return The indices of all anomalies in the interval and their corresponding wrapper object. */ @@ -132,6 +134,26 @@ case class OnlineNormalStrategy( searchInterval: (Int, Int)) : Seq[(Int, Anomaly)] = { + detectWithExtendedResults(dataSeries, searchInterval) + .filter { case (_, anomDataPoint) => anomDataPoint.isAnomaly } + .map { case (i, anomDataPoint) => + (i, Anomaly(Some(anomDataPoint.dataMetricValue), anomDataPoint.confidence, anomDataPoint.detail)) + } + } + + /** + * Search for anomalies in a series of data points, returns extended results. + * + * @param dataSeries The data contained in a Vector of Doubles. + * @param searchInterval The indices between which anomalies should be detected. [a, b). + * @return The indices of all anomalies in the interval and their corresponding wrapper object + * with extended results. 
+ */ + override def detectWithExtendedResults( + dataSeries: Vector[Double], + searchInterval: (Int, Int)) + : Seq[(Int, AnomalyDetectionDataPoint)] = { + val (searchStart, searchEnd) = searchInterval require(searchStart <= searchEnd, "The start of the interval can't be larger than the end.") @@ -139,7 +161,6 @@ case class OnlineNormalStrategy( computeStatsAndAnomalies(dataSeries, searchInterval) .zipWithIndex .slice(searchStart, searchEnd) - .filter { case (result, _) => result.isAnomaly } .map { case (calcRes, index) => val lowerBound = calcRes.mean - lowerDeviationFactor.getOrElse(Double.MaxValue) * calcRes.stdDev @@ -149,7 +170,11 @@ case class OnlineNormalStrategy( val detail = Some(s"[OnlineNormalStrategy]: Value ${dataSeries(index)} is not in " + s"bounds [$lowerBound, $upperBound].") - (index, Anomaly(Option(dataSeries(index)), 1.0, detail)) + val value = dataSeries(index) + + (index, AnomalyDetectionDataPoint(value, value, + BoundedRange(lowerBound = Bound(lowerBound, inclusive = true), + upperBound = Bound(upperBound, inclusive = true)), calcRes.isAnomaly, 1.0, detail)) } } } diff --git a/src/main/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategy.scala b/src/main/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategy.scala index ec7f5df74..5e5fe72e8 100644 --- a/src/main/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategy.scala +++ b/src/main/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategy.scala @@ -25,34 +25,61 @@ package com.amazon.deequ.anomalydetection case class SimpleThresholdStrategy( lowerBound: Double = Double.MinValue, upperBound: Double) - extends AnomalyDetectionStrategy { + extends AnomalyDetectionStrategy with AnomalyDetectionStrategyWithExtendedResults { require(lowerBound <= upperBound, "The lower bound must be smaller or equal to the upper bound.") /** - * Search for anomalies in a series of data points. + * Search for anomalies in a series of data points. This function uses the + * detectWithExtendedResults function and then filters and maps to return only anomaly objects. * - * @param dataSeries The data contained in a Vector of Doubles + * @param dataSeries The data contained in a Vector of Doubles. * @param searchInterval The indices between which anomalies should be detected. [a, b). * @return The indices of all anomalies in the interval and their corresponding wrapper object. */ override def detect( + dataSeries: Vector[Double], + searchInterval: (Int, Int)) + : Seq[(Int, Anomaly)] = { + + detectWithExtendedResults(dataSeries, searchInterval) + .filter { case (_, anomDataPoint) => anomDataPoint.isAnomaly } + .map { case (i, anomDataPoint) => + (i, Anomaly(Some(anomDataPoint.dataMetricValue), anomDataPoint.confidence, anomDataPoint.detail)) + } + } + + /** + * Search for anomalies in a series of data points, returns extended results. + * + * @param dataSeries The data contained in a Vector of Doubles. + * @param searchInterval The indices between which anomalies should be detected. [a, b). + * @return The indices of all anomalies in the interval and their corresponding wrapper object + * with extended results. 
+   */
+  override def detectWithExtendedResults(
       dataSeries: Vector[Double],
-      searchInterval: (Int, Int)): Seq[(Int, Anomaly)] = {
+      searchInterval: (Int, Int)): Seq[(Int, AnomalyDetectionDataPoint)] = {
 
     val (searchStart, searchEnd) = searchInterval
 
-    require (searchStart <= searchEnd, "The start of the interval can't be larger than the end.")
+    require(searchStart <= searchEnd, "The start of the interval can't be larger than the end.")
 
     dataSeries.zipWithIndex
       .slice(searchStart, searchEnd)
-      .filter { case (value, _) => value < lowerBound || value > upperBound }
       .map { case (value, index) =>
-        val detail = Some(s"[SimpleThresholdStrategy]: Value $value is not in " +
-          s"bounds [$lowerBound, $upperBound]")
+        val (detail, isAnomaly) = if (value < lowerBound || value > upperBound) {
+          (Some(s"[SimpleThresholdStrategy]: Value $value is not in " +
+            s"bounds [$lowerBound, $upperBound]"), true)
+        } else {
+          (None, false)
+        }
 
-        (index, Anomaly(Option(value), 1.0, detail))
+        (index, AnomalyDetectionDataPoint(value, value,
+          BoundedRange(lowerBound = Bound(lowerBound, inclusive = true),
+            upperBound = Bound(upperBound, inclusive = true)), isAnomaly, 1.0, detail))
       }
   }
 }
diff --git a/src/main/scala/com/amazon/deequ/anomalydetection/seasonal/HoltWinters.scala b/src/main/scala/com/amazon/deequ/anomalydetection/seasonal/HoltWinters.scala
index 203dfe9fa..082911b1c 100644
--- a/src/main/scala/com/amazon/deequ/anomalydetection/seasonal/HoltWinters.scala
+++ b/src/main/scala/com/amazon/deequ/anomalydetection/seasonal/HoltWinters.scala
@@ -17,8 +17,15 @@ package com.amazon.deequ.anomalydetection.seasonal
 
 import breeze.linalg.DenseVector
-import breeze.optimize.{ApproximateGradientFunction, DiffFunction, LBFGSB}
-import com.amazon.deequ.anomalydetection.{Anomaly, AnomalyDetectionStrategy}
+import breeze.optimize.ApproximateGradientFunction
+import breeze.optimize.DiffFunction
+import breeze.optimize.LBFGSB
+import com.amazon.deequ.anomalydetection.Anomaly
+import com.amazon.deequ.anomalydetection.AnomalyDetectionDataPoint
+import com.amazon.deequ.anomalydetection.AnomalyDetectionStrategy
+import com.amazon.deequ.anomalydetection.AnomalyDetectionStrategyWithExtendedResults
+import com.amazon.deequ.anomalydetection.BoundedRange
+import com.amazon.deequ.anomalydetection.Bound
 
 import collection.mutable.ListBuffer
 
@@ -49,7 +56,7 @@ object HoltWinters {
 }
 
 class HoltWinters(seriesPeriodicity: Int)
-  extends AnomalyDetectionStrategy {
+  extends AnomalyDetectionStrategy with AnomalyDetectionStrategyWithExtendedResults {
 
   import HoltWinters._
 
@@ -174,37 +181,77 @@ class HoltWinters(seriesPeriodicity: Int)
     )
   }
 
-  private def findAnomalies(
-      testSeries: Vector[Double],
-      forecasts: Seq[Double],
-      startIndex: Int,
-      residualSD: Double)
-    : Seq[(Int, Anomaly)] = {
-    testSeries.zip(forecasts).zipWithIndex
-      .collect { case ((inputValue, forecastedValue), detectionIndex)
-        if math.abs(inputValue - forecastedValue) > 1.96 * residualSD =>
+  /**
+   * This function is renamed to add 'withExtendedResults' to the name.
+   * The functionality no longer filters out non-anomalies, but instead leaves a flag
+   * of whether each point is an anomaly or not. The previous anomaly detection strategy uses this refactored function
+   * and then does the filtering to remove non-anomalies and maps to the previous anomaly objects.
+   * The new anomaly detection strategy with extended results uses this function and does not filter on it.
+ */ + private def findAnomaliesWithExtendedResults( + testSeries: Vector[Double], + forecasts: Seq[Double], + startIndex: Int, + residualSD: Double) + : Seq[(Int, AnomalyDetectionDataPoint)] = { - detectionIndex + startIndex -> Anomaly( - value = Some(inputValue), + testSeries.zip(forecasts).zipWithIndex + .collect { case ((inputValue, forecastedValue), detectionIndex) => + val anomalyMetricValue = math.abs(inputValue - forecastedValue) + val upperBound = 1.96 * residualSD + + val (detail, isAnomaly) = if (anomalyMetricValue > upperBound) { + (Some(s"Forecasted $forecastedValue for observed value $inputValue"), true) + } else { + (None, false) + } + detectionIndex + startIndex -> AnomalyDetectionDataPoint( + dataMetricValue = inputValue, + anomalyMetricValue = anomalyMetricValue, + anomalyCheckRange = BoundedRange(lowerBound = Bound(Double.MinValue, inclusive = true), + upperBound = Bound(upperBound, inclusive = true)), + isAnomaly = isAnomaly, confidence = 1.0, - detail = Some(s"Forecasted $forecastedValue for observed value $inputValue") + detail = detail ) } } /** - * Search for anomalies in a series of data points. + * Search for anomalies in a series of data points. This function uses the + * detectWithExtendedResults function and then filters and maps to return only anomaly objects. * - * @param dataSeries The data contained in a Vector of Doubles + * @param dataSeries The data contained in a Vector of Doubles. * @param searchInterval The indices between which anomalies should be detected. [a, b). * @return The indices of all anomalies in the interval and their corresponding wrapper object. + * */ override def detect( dataSeries: Vector[Double], searchInterval: (Int, Int) = (0, Int.MaxValue)) : Seq[(Int, Anomaly)] = { + detectWithExtendedResults(dataSeries, searchInterval) + .filter { case (_, anomDataPoint) => anomDataPoint.isAnomaly } + .map { case (i, anomDataPoint) => + (i, Anomaly(Some(anomDataPoint.dataMetricValue), anomDataPoint.confidence, anomDataPoint.detail)) + } + } + + /** + * Search for anomalies in a series of data points, returns extended results. + * + * @param dataSeries The data contained in a Vector of Doubles. + * @param searchInterval The indices between which anomalies should be detected. [a, b). + * @return The indices of all anomalies in the interval and their corresponding wrapper object + * with extended results. 
+ */ + override def detectWithExtendedResults( + dataSeries: Vector[Double], + searchInterval: (Int, Int) = (0, Int.MaxValue)) + : Seq[(Int, AnomalyDetectionDataPoint)] = { + require(dataSeries.nonEmpty, "Provided data series is empty") val (start, end) = searchInterval @@ -245,6 +292,6 @@ class HoltWinters(seriesPeriodicity: Int) require(modelResults.forecasts.size == numberOfObservationsToForecast) val testSeries = dataSeries.drop(start) - findAnomalies(testSeries, modelResults.forecasts, start, residualsStandardDeviation) + findAnomaliesWithExtendedResults(testSeries, modelResults.forecasts, start, residualsStandardDeviation) } } diff --git a/src/main/scala/com/amazon/deequ/checks/Check.scala b/src/main/scala/com/amazon/deequ/checks/Check.scala index 1e1048921..446c2022d 100644 --- a/src/main/scala/com/amazon/deequ/checks/Check.scala +++ b/src/main/scala/com/amazon/deequ/checks/Check.scala @@ -25,10 +25,15 @@ import com.amazon.deequ.analyzers.Histogram import com.amazon.deequ.analyzers.KLLParameters import com.amazon.deequ.analyzers.Patterns import com.amazon.deequ.analyzers.State -import com.amazon.deequ.anomalydetection.HistoryUtils +import com.amazon.deequ.anomalydetection.AnomalyDetectionAssertionResult +import com.amazon.deequ.anomalydetection.AnomalyDetectionExtendedResult +import com.amazon.deequ.anomalydetection.ExtendedDetectionResult import com.amazon.deequ.anomalydetection.AnomalyDetectionStrategy +import com.amazon.deequ.anomalydetection.AnomalyDetectionStrategyWithExtendedResults import com.amazon.deequ.anomalydetection.AnomalyDetector +import com.amazon.deequ.anomalydetection.AnomalyDetectorWithExtendedResults import com.amazon.deequ.anomalydetection.DataPoint +import com.amazon.deequ.anomalydetection.HistoryUtils import com.amazon.deequ.checks.ColumnCondition.isAnyNotNull import com.amazon.deequ.checks.ColumnCondition.isEachNotNull import com.amazon.deequ.constraints.Constraint._ @@ -585,6 +590,44 @@ case class Check( addConstraint(anomalyConstraint(analyzer, anomalyAssertionFunction, hint)) } + /** + * Creates a constraint that runs AnomalyDetection with extended results on the new value. + * + * @param metricsRepository A metrics repository to get the previous results. + * @param anomalyDetectionStrategyWithExtendedResults The anomaly detection strategy with extended results. + * @param analyzer The analyzer for the metric to run anomaly detection on. + * @param withTagValues Can contain a Map with tag names and the corresponding values + * to filter for. + * @param beforeDate The maximum dateTime of previous AnalysisResults to use for + * the Anomaly Detection. + * @param afterDate The minimum dateTime of previous AnalysisResults to use for + * the Anomaly Detection. + * @param hint A hint to provide additional context why a constraint + * could have failed. 
+   * @return The Check object with the anomaly detection constraint added.
+   */
+  private[deequ] def isNewestPointNonAnomalousWithExtendedResults[S <: State[S]](
+      metricsRepository: MetricsRepository,
+      anomalyDetectionStrategyWithExtendedResults: AnomalyDetectionStrategyWithExtendedResults,
+      analyzer: Analyzer[S, Metric[Double]],
+      withTagValues: Map[String, String],
+      afterDate: Option[Long],
+      beforeDate: Option[Long],
+      hint: Option[String] = None)
+    : Check = {
+
+    val anomalyAssertionFunction = Check.isNewestPointNonAnomalousWithExtendedResults(
+      metricsRepository,
+      anomalyDetectionStrategyWithExtendedResults,
+      analyzer,
+      withTagValues,
+      afterDate,
+      beforeDate
+    )(_)
+
+    addConstraint(anomalyConstraintWithExtendedResults(analyzer, anomalyAssertionFunction, hint))
+  }
+
   /**
    * Creates a constraint that asserts on a column entropy.
@@ -1263,6 +1306,7 @@ case class Check(
       }
       .collect {
         case constraint: AnalysisBasedConstraint[_, _, _] => constraint.analyzer
+        case constraint: AnomalyExtendedResultsConstraint[_, _, _] => constraint.analyzer
       }
       .map { _.asInstanceOf[Analyzer[_, Metric[_]]] }
       .toSet
@@ -1355,4 +1399,117 @@ object Check {
     detectedAnomalies.anomalies.isEmpty
   }
+
+
+  /**
+   * Common assertion function checking if the value can be considered as normal (that no
+   * anomalies were detected), given the anomaly detection strategy with extended results
+   * and details on how to retrieve the history.
+   * This assertion function returns an AnomalyDetectionAssertionResult which contains
+   * anomaly detection extended results.
+   *
+   * @param metricsRepository A metrics repository to get the previous results.
+   * @param anomalyDetectionStrategyWithExtendedResults The anomaly detection strategy with extended results.
+   * @param analyzer The analyzer for the metric to run anomaly detection on.
+   * @param withTagValues Can contain a Map with tag names and the corresponding values
+   *                      to filter for.
+   * @param beforeDate The maximum dateTime of previous AnalysisResults to use for
+   *                   the Anomaly Detection.
+   * @param afterDate The minimum dateTime of previous AnalysisResults to use for
+   *                  the Anomaly Detection.
+   * @param currentMetricValue The current metric value.
+   * @return The AnomalyDetectionAssertionResult with the boolean indicating whether the newest data point
+   *         is anomalous, along with the AnomalyDetectionExtendedResult object which contains the
+   *         anomaly detection extended result details.
+   */
+  private[deequ] def isNewestPointNonAnomalousWithExtendedResults[S <: State[S]](
+      metricsRepository: MetricsRepository,
+      anomalyDetectionStrategyWithExtendedResults: AnomalyDetectionStrategyWithExtendedResults,
+      analyzer: Analyzer[S, Metric[Double]],
+      withTagValues: Map[String, String],
+      afterDate: Option[Long],
+      beforeDate: Option[Long])(
+      currentMetricValue: Double)
+    : AnomalyDetectionAssertionResult = {
+
+    // Get history keys
+    var repositoryLoader = metricsRepository.load()
+
+    repositoryLoader = repositoryLoader.withTagValues(withTagValues)
+
+    beforeDate.foreach { beforeDate =>
+      repositoryLoader = repositoryLoader.before(beforeDate)
+    }
+
+    afterDate.foreach { afterDate =>
+      repositoryLoader = repositoryLoader.after(afterDate)
+    }
+
+    repositoryLoader = repositoryLoader.forAnalyzers(Seq(analyzer))
+
+    val analysisResults = repositoryLoader.get()
+
+    require(analysisResults.nonEmpty, "There have to be previous results in the MetricsRepository!")
+
+    val historicalMetrics = analysisResults
+      // If we have multiple DataPoints with the same dateTime, which should not happen in most
+      // cases, we still want consistent behaviour, so we sort them by Tags first
+      // (sorting is stable in Scala)
+      .sortBy(_.resultKey.tags.values)
+      .map { analysisResult =>
+        val analyzerContextMetricMap = analysisResult.analyzerContext.metricMap
+
+        val onlyAnalyzerMetricEntryInLoadedAnalyzerContext = analyzerContextMetricMap.headOption
+
+        val doubleMetricOption = onlyAnalyzerMetricEntryInLoadedAnalyzerContext
+          .collect { case (_, metric) => metric.asInstanceOf[Metric[Double]] }
+
+        val dataSetDate = analysisResult.resultKey.dataSetDate
+
+        (dataSetDate, doubleMetricOption)
+      }
+
+    // Ensure this is the last dataPoint
+    val testDateTime = analysisResults.map(_.resultKey.dataSetDate).max + 1
+    require(testDateTime != Long.MaxValue, "Test DateTime cannot be Long.MaxValue, otherwise the " +
+      "Anomaly Detection, which works with an open upper interval bound, won't test anything")
+
+    // Run the given anomaly detection strategy on the newest value and keep the extended results
+    val anomalyDetector = AnomalyDetectorWithExtendedResults(anomalyDetectionStrategyWithExtendedResults)
+    val anomalyDetectionResult: ExtendedDetectionResult = anomalyDetector.isNewPointAnomalousWithExtendedResults(
+      HistoryUtils.extractMetricValues[Double](historicalMetrics),
+      DataPoint(testDateTime, Some(currentMetricValue)))
+
+
+    // This function checks if the newest point is anomalous and returns a boolean for the assertion,
+    // along with that newest point and its anomaly check details
+    getNewestPointAnomalyResults(anomalyDetectionResult)
+  }
+
+  /**
+   * Takes in an ExtendedDetectionResult and returns an AnomalyDetectionAssertionResult.
+   * @param extendedDetectionResult Contains the sequence of AnomalyDetectionDataPoints.
+   * @return The AnomalyDetectionAssertionResult with the boolean indicating whether the newest data point
+   *         is anomalous, and the AnomalyDetectionExtendedResult containing the newest data point
+   *         wrapped in the AnomalyDetectionDataPoint class.
+   */
+  private[deequ] def getNewestPointAnomalyResults(extendedDetectionResult: ExtendedDetectionResult):
+    AnomalyDetectionAssertionResult = {
+    val (hasAnomaly, anomalyDetectionExtendedResults): (Boolean, AnomalyDetectionExtendedResult) = {
+
+      // Based on upstream code, this anomaly detection data point sequence should never be empty.
+      require(extendedDetectionResult.anomalyDetectionDataPointSequence != Nil,
+        "anomalyDetectionDataPoints from AnomalyDetectionExtendedResult cannot be empty")
+
+      // Get the last anomaly detection data point of the sequence (there should only be one element for now).
+      // Check the isAnomaly boolean, and also return the last anomaly detection data point
+      // wrapped in the anomaly detection extended result class.
+      extendedDetectionResult.anomalyDetectionDataPointSequence match {
+        case _ :+ lastAnomalyDataPointPair =>
+          (lastAnomalyDataPointPair._2.isAnomaly, AnomalyDetectionExtendedResult(lastAnomalyDataPointPair._2))
+      }
+    }
+    AnomalyDetectionAssertionResult(
+      hasAnomaly = hasAnomaly, anomalyDetectionExtendedResult = anomalyDetectionExtendedResults)
+  }
 }
diff --git a/src/main/scala/com/amazon/deequ/constraints/AnomalyExtendedResultsConstraint.scala b/src/main/scala/com/amazon/deequ/constraints/AnomalyExtendedResultsConstraint.scala
new file mode 100644
index 000000000..03c374565
--- /dev/null
+++ b/src/main/scala/com/amazon/deequ/constraints/AnomalyExtendedResultsConstraint.scala
@@ -0,0 +1,134 @@
+/**
+ * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"). You may not
+ * use this file except in compliance with the License. A copy of the License
+ * is located at
+ *
+ *     http://aws.amazon.com/apache2.0/
+ *
+ * or in the "license" file accompanying this file. This file is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+package com.amazon.deequ.constraints
+
+import com.amazon.deequ.analyzers.Analyzer
+import com.amazon.deequ.analyzers.State
+import com.amazon.deequ.anomalydetection.AnomalyDetectionAssertionResult
+import com.amazon.deequ.metrics.Metric
+import org.apache.spark.sql.DataFrame
+
+import scala.util.Success
+import scala.util.Failure
+
+/**
+ * Case class for anomaly constraints with extended results that provides a unified way to access
+ * AnalyzerContext and metrics stored in it.
+ *
+ * Runs the analysis and gets the value of the metric returned by the analysis,
+ * picks the numeric value that will be used in the assertion function with the metric picker,
+ * and runs the assertion.
+ *
+ * @param analyzer    Analyzer to be run on the data frame.
+ * @param assertion   Assertion function that returns an AnomalyDetectionAssertionResult with
+ *                    anomaly detection extended results as well as the assertion boolean.
+ * @param valuePicker Optional function to pick the part of the metric value that the
+ *                    assertion will be running on. Absence of such a function means the metric
+ *                    value is used in the assertion as it is.
+ * @param hint        A hint to provide additional context why a constraint could have failed.
+ * @tparam M : Type of the metric value generated by the Analyzer.
+ * @tparam V : Type of the value being used in the assertion function.
+ *
+ */
+private[deequ] case class AnomalyExtendedResultsConstraint[S <: State[S], M, V](
+    analyzer: Analyzer[S, Metric[M]],
+    private[deequ] val assertion: V => AnomalyDetectionAssertionResult,
+    private[deequ] val valuePicker: Option[M => V] = None,
+    private[deequ] val hint: Option[String] = None)
+  extends Constraint {
+
+  private[deequ] def calculateAndEvaluate(data: DataFrame) = {
+    val metric = analyzer.calculate(data)
+    evaluate(Map(analyzer -> metric))
+  }
+
+  override def evaluate(
+      analysisResults: Map[Analyzer[_, Metric[_]], Metric[_]])
+    : ConstraintResult = {
+
+    val metric = analysisResults.get(analyzer).map(_.asInstanceOf[Metric[M]])
+
+    metric.map(pickValueAndAssert).getOrElse(
+      // Analysis is missing
+      ConstraintResult(this, ConstraintStatus.Failure,
+        message = Some(AnomalyExtendedResultsConstraint.MissingAnalysis), metric = metric)
+    )
+  }
+
+  private[this] def pickValueAndAssert(metric: Metric[M]): ConstraintResult = {
+
+    metric.value match {
+      // Analysis done successfully and result metric is there
+      case Success(metricValue) =>
+        try {
+          val assertOn = runPickerOnMetric(metricValue)
+          val anomalyAssertionResult = runAssertion(assertOn)
+
+          if (!anomalyAssertionResult.hasAnomaly) {
+            ConstraintResult(this, ConstraintStatus.Success, metric = Some(metric),
+              anomalyDetectionExtendedResultOption = Some(anomalyAssertionResult.anomalyDetectionExtendedResult))
+          } else {
+            var errorMessage = s"Value: $assertOn does not meet the constraint requirement," +
+              s" check the anomaly detection metadata!"
+            hint.foreach(hint => errorMessage += s" $hint")
+
+            ConstraintResult(this, ConstraintStatus.Failure, Some(errorMessage), Some(metric),
+              anomalyDetectionExtendedResultOption = Some(anomalyAssertionResult.anomalyDetectionExtendedResult))
+          }
+
+        } catch {
+          case AnomalyExtendedResultsConstraint.ConstraintAssertionException(msg) =>
+            ConstraintResult(this, ConstraintStatus.Failure,
+              message = Some(s"${AnomalyExtendedResultsConstraint.AssertionException}: $msg!"), metric = Some(metric))
+          case AnomalyExtendedResultsConstraint.ValuePickerException(msg) =>
+            ConstraintResult(this, ConstraintStatus.Failure,
+              message = Some(s"${AnomalyExtendedResultsConstraint.ProblematicMetricPicker}: $msg!"),
+              metric = Some(metric))
+        }
+      // An exception occurred during analysis
+      case Failure(e) => ConstraintResult(this,
+        ConstraintStatus.Failure, message = Some(e.getMessage), metric = Some(metric))
+    }
+  }
+
+  private def runPickerOnMetric(metricValue: M): V =
+    try {
+      valuePicker.map(function => function(metricValue)).getOrElse(metricValue.asInstanceOf[V])
+    } catch {
+      case e: Exception => throw AnomalyExtendedResultsConstraint.ValuePickerException(e.getMessage)
+    }
+
+  private def runAssertion(assertOn: V): AnomalyDetectionAssertionResult =
+    try {
+      assertion(assertOn)
+    } catch {
+      case e: Exception => throw AnomalyExtendedResultsConstraint.ConstraintAssertionException(e.getMessage)
+    }
+
+  // 'assertion' and 'valuePicker' are lambdas, so we have to represent them as '<function1>'
+  override def toString: String =
+    s"AnomalyExtendedResultsConstraint($analyzer,<function1>,${valuePicker.map(_ => "<function1>")},$hint)"
+}
+
+private[deequ] object AnomalyExtendedResultsConstraint {
+  val MissingAnalysis = "Missing Analysis, can't run the constraint!"
+  val ProblematicMetricPicker = "Can't retrieve the value to assert on"
+  val AssertionException = "Can't execute the assertion"
+
+  private case class ValuePickerException(message: String) extends RuntimeException(message)
+  private case class ConstraintAssertionException(message: String) extends RuntimeException(message)
+}
diff --git a/src/main/scala/com/amazon/deequ/constraints/Constraint.scala b/src/main/scala/com/amazon/deequ/constraints/Constraint.scala
index e289b3859..413e384ca 100644
--- a/src/main/scala/com/amazon/deequ/constraints/Constraint.scala
+++ b/src/main/scala/com/amazon/deequ/constraints/Constraint.scala
@@ -17,6 +17,7 @@
 package com.amazon.deequ.constraints
 
 import com.amazon.deequ.analyzers._
+import com.amazon.deequ.anomalydetection.{AnomalyDetectionAssertionResult, AnomalyDetectionExtendedResult}
 import com.amazon.deequ.metrics.BucketDistribution
 import com.amazon.deequ.metrics.Distribution
 import com.amazon.deequ.metrics.Metric
@@ -30,11 +31,23 @@ object ConstraintStatus extends Enumeration {
   val Success, Failure = Value
 }
 
+/**
+ * ConstraintResult Class
+ *
+ * @param constraint Constraint associated with the result.
+ * @param status     Status of the constraint (Success, Failure).
+ * @param message    Optional message for errors.
+ * @param metric     Optional Metric from the calculation.
+ * @param anomalyDetectionExtendedResultOption Optional anomaly detection extended results,
+ *                                             present when using anomaly detection with extended results.
+ */
 case class ConstraintResult(
     constraint: Constraint,
     status: ConstraintStatus.Value,
     message: Option[String] = None,
-    metric: Option[Metric[_]] = None)
+    metric: Option[Metric[_]] = None,
+    anomalyDetectionExtendedResultOption: Option[AnomalyDetectionExtendedResult] = None)
 
 /** Common trait for all data quality constraints */
 trait Constraint extends Serializable {
@@ -248,6 +261,28 @@ object Constraint {
     new NamedConstraint(constraint, s"AnomalyConstraint($analyzer)")
   }
 
+  /**
+   * Runs the given analysis on the given column and executes the anomaly assertion,
+   * and also returns extended results.
+   *
+   * @param analyzer         Analyzer for the metric to do Anomaly Detection on.
+   * @param anomalyAssertion Function that receives a double input parameter
+   *                         (since the metric is a double metric) and returns an AnomalyDetectionAssertionResult
+   *                         which contains a boolean and anomaly extended results.
+   * @param hint             A hint to provide additional context why a constraint could have failed.
+   */
+  def anomalyConstraintWithExtendedResults[S <: State[S]](
+      analyzer: Analyzer[S, Metric[Double]],
+      anomalyAssertion: Double => AnomalyDetectionAssertionResult,
+      hint: Option[String] = None)
+    : Constraint = {
+
+    val constraint = AnomalyExtendedResultsConstraint[S, Double, Double](analyzer, anomalyAssertion,
+      hint = hint)
+
+    new NamedConstraint(constraint, s"AnomalyConstraintWithExtendedResults($analyzer)")
+  }
+
   /**
    * Runs Uniqueness analysis on the given columns and executes the assertion
   *
diff --git a/src/main/scala/com/amazon/deequ/examples/AnomalyDetectionWithExtendedResultsExample.scala b/src/main/scala/com/amazon/deequ/examples/AnomalyDetectionWithExtendedResultsExample.scala
new file mode 100644
index 000000000..6666b9171
--- /dev/null
+++ b/src/main/scala/com/amazon/deequ/examples/AnomalyDetectionWithExtendedResultsExample.scala
@@ -0,0 +1,99 @@
+/**
+ * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License").
You may not
+ * use this file except in compliance with the License. A copy of the License
+ * is located at
+ *
+ *     http://aws.amazon.com/apache2.0/
+ *
+ * or in the "license" file accompanying this file. This file is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+package com.amazon.deequ.examples
+
+import com.amazon.deequ.VerificationSuite
+import com.amazon.deequ.analyzers.Size
+import com.amazon.deequ.anomalydetection.RelativeRateOfChangeStrategy
+import com.amazon.deequ.checks.CheckStatus._
+import com.amazon.deequ.examples.ExampleUtils.itemsAsDataframe
+import com.amazon.deequ.examples.ExampleUtils.withSpark
+import com.amazon.deequ.repository.ResultKey
+import com.amazon.deequ.repository.memory.InMemoryMetricsRepository
+
+private[examples] object AnomalyDetectionWithExtendedResultsExample extends App {
+
+  withSpark { session =>
+
+    /* In this simple example, we assume that we compute metrics on a dataset every day and we want
+       to ensure that they don't change drastically. For the sake of simplicity, we just look at the
+       size of the data */
+
+    /* Anomaly detection operates on metrics stored in a metric repository, so let's create one */
+    val metricsRepository = new InMemoryMetricsRepository()
+
+    /* This is the key which we use to store the metrics for the dataset from yesterday
+       (24 hours ago, in milliseconds) */
+    val yesterdaysKey = ResultKey(System.currentTimeMillis() - 24 * 60 * 60 * 1000)
+
+    /* Yesterday, the data had only two rows */
+    val yesterdaysDataset = itemsAsDataframe(session,
+      Item(1, "Thingy A", "awesome thing.", "high", 0),
+      Item(2, "Thingy B", "available at http://thingb.com", null, 0))
+
+    /* We test for anomalies in the size of the data; it should not increase by more than 2x. Note
+       that we store the resulting metrics in our repository */
+    VerificationSuite()
+      .onData(yesterdaysDataset)
+      .useRepository(metricsRepository)
+      .saveOrAppendResult(yesterdaysKey)
+      .addAnomalyCheckWithExtendedResults(
+        RelativeRateOfChangeStrategy(maxRateIncrease = Some(2.0)),
+        Size()
+      )
+      .run()
+
+    /* Today's data has five rows, so the data size more than doubled and our anomaly check should
+       catch this */
+    val todaysDataset = itemsAsDataframe(session,
+      Item(1, "Thingy A", "awesome thing.", "high", 0),
+      Item(2, "Thingy B", "available at http://thingb.com", null, 0),
+      Item(3, null, null, "low", 5),
+      Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10),
+      Item(5, "Thingy E", null, "high", 12))
+
+    /* The key for today's result */
+    val todaysKey = ResultKey(System.currentTimeMillis())
+
+    /* Repeat the anomaly check for today's data */
+    val verificationResult = VerificationSuite()
+      .onData(todaysDataset)
+      .useRepository(metricsRepository)
+      .saveOrAppendResult(todaysKey)
+      .addAnomalyCheckWithExtendedResults(
+        RelativeRateOfChangeStrategy(maxRateIncrease = Some(2.0)),
+        Size()
+      )
+      .run()
+
+    /* Did we find an anomaly? */
+    if (verificationResult.status != Success) {
+      println("Anomaly detected in the Size() metric!")
+      val anomalyDetectionDataPoint = verificationResult.checkResults.head._2.constraintResults.
+        head.anomalyDetectionExtendedResultOption.get.anomalyDetectionDataPoint
+      println(s"Rate of change of ${anomalyDetectionDataPoint.anomalyMetricValue} was not in " +
+        s"${anomalyDetectionDataPoint.anomalyCheckRange}")
+
+      /* Let's have a look at the actual metrics.
*/
+      metricsRepository
+        .load()
+        .forAnalyzers(Seq(Size()))
+        .getSuccessMetricsAsDataFrame(session)
+        .show()
+    }
+  }
+
+}
diff --git a/src/main/scala/com/amazon/deequ/examples/anomaly_detection_example.md b/src/main/scala/com/amazon/deequ/examples/anomaly_detection_example.md
index 9acf7d83d..d72f5e951 100644
--- a/src/main/scala/com/amazon/deequ/examples/anomaly_detection_example.md
+++ b/src/main/scala/com/amazon/deequ/examples/anomaly_detection_example.md
@@ -1,5 +1,7 @@
 # Anomaly detection
 
+*After reading this page, check out [anomaly checks with extended results](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/anomaly_detection_with_extended_results_example.md) for how to access more details about the anomaly check, such as the upper and lower bounds used in the check. This requires using a different method that has the same signature.*
+
 Very often, it is hard to exactly define what constraints we want to evaluate on our data. However, we often have a better understanding of how much change we expect in certain metrics of our data. Therefore, **deequ** supports anomaly detection for data quality metrics. The idea is that we regularly store the metrics of our data in a [MetricsRepository](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/metrics_repository_example.md). Once we do that, we can run anomaly checks that compare the current value of the metric to its values in the past and allow us to detect anomalous changes.
 
 In this simple example, we assume that we compute the size of a dataset every day and we want to ensure that it does not change drastically: the number of rows on a given day should not be more than double of what we have seen on the day before.
diff --git a/src/main/scala/com/amazon/deequ/examples/anomaly_detection_with_extended_results_example.md b/src/main/scala/com/amazon/deequ/examples/anomaly_detection_with_extended_results_example.md
new file mode 100644
index 000000000..6b89b5d06
--- /dev/null
+++ b/src/main/scala/com/amazon/deequ/examples/anomaly_detection_with_extended_results_example.md
@@ -0,0 +1,75 @@
+# Anomaly detection with extended results
+
+Using the `addAnomalyCheckWithExtendedResults` method instead of the original `addAnomalyCheck` method, you can get more
+detailed results about the outcome of the anomaly check on the newly created metric. You can get details such as:
+
+- dataMetricValue: The metric value that is the data point.
+- anomalyMetricValue: The metric value that is being checked by the anomaly detection strategy, which isn't always equal to the dataMetricValue.
+- anomalyCheckRange: The range of bounds used in the anomaly check; the anomalyMetricValue is compared to this range.
+- isAnomaly: If the anomalyMetricValue is outside the anomalyCheckRange, this is true.
+- confidence: The confidence of the anomaly detection.
+- detail: An optional detail message.
+
+These are contained within the AnomalyDetectionDataPoint class.
+```scala
+class AnomalyDetectionDataPoint(
+  val dataMetricValue: Double,
+  val anomalyMetricValue: Double,
+  val anomalyCheckRange: BoundedRange,
+  val isAnomaly: Boolean,
+  val confidence: Double,
+  val detail: Option[String])
+
+case class BoundedRange(lowerBound: Bound, upperBound: Bound)
+
+case class Bound(value: Double, inclusive: Boolean)
+```
+
+In terms of accessing the result, the AnomalyDetectionDataPoint is wrapped in an AnomalyDetectionExtendedResult class
+that is an optional field in the ConstraintResult class.
+
+When accessing the result, the AnomalyDetectionDataPoint is wrapped in an AnomalyDetectionExtendedResult, which is an
+optional field of the ConstraintResult class, the class that holds the outcome of a single constraint check:
+
+```scala
+case class ConstraintResult(
+    constraint: Constraint,
+    status: ConstraintStatus.Value,
+    message: Option[String] = None,
+    metric: Option[Metric[_]] = None,
+    anomalyDetectionExtendedResultOption: Option[AnomalyDetectionExtendedResult] = None)
+
+case class AnomalyDetectionExtendedResult(anomalyDetectionDataPoint: AnomalyDetectionDataPoint)
+```
+
+In order to get extended results, you need to run your verification suite with the `addAnomalyCheckWithExtendedResults`
+method, which has the same signature as the original `addAnomalyCheck` method:
+
+```scala
+val result = VerificationSuite()
+  .onData(yesterdaysDataset)
+  .useRepository(metricsRepository)
+  .saveOrAppendResult(yesterdaysKey)
+  .addAnomalyCheckWithExtendedResults(
+    RelativeRateOfChangeStrategy(maxRateIncrease = Some(2.0)),
+    Size())
+  .run()
+
+val anomalyDetectionExtendedResult: AnomalyDetectionExtendedResult =
+  result.checkResults.head._2.constraintResults.head
+    .anomalyDetectionExtendedResultOption
+    .getOrElse(sys.error("No extended result was attached to this constraint"))
+
+val anomalyDetectionDataPoint: AnomalyDetectionDataPoint =
+  anomalyDetectionExtendedResult.anomalyDetectionDataPoint
+```
+
+You can then access the values of the extended result, such as the anomalyMetricValue and the anomalyCheckRange,
+producing output of the following form:
+
+```scala
+println(s"Anomaly check range: ${anomalyDetectionDataPoint.anomalyCheckRange}")
+println(s"Anomaly metric value: ${anomalyDetectionDataPoint.anomalyMetricValue}")
+```
+
+```
+Anomaly check range: BoundedRange(Bound(-2.0,true),Bound(2.0,true))
+Anomaly metric value: 4.5
+```
+
+An [executable version of this example with extended results](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/AnomalyDetectionWithExtendedResultsExample.scala) is available as part of our code base.
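+
+If your suite contains more than one check, `checkResults.head` only inspects the first one. Here is a short sketch,
+assuming only the result shapes shown above, that walks over every check and prints any extended results it carries:
+
+```scala
+// Iterate all checks in the suite and surface their anomaly detection data points.
+result.checkResults.foreach { case (check, checkResult) =>
+  checkResult.constraintResults
+    .flatMap(_.anomalyDetectionExtendedResultOption)
+    .foreach { extendedResult =>
+      println(s"${check.description}: ${extendedResult.anomalyDetectionDataPoint}")
+    }
+}
+```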
diff --git a/src/test/scala/com/amazon/deequ/VerificationSuiteTest.scala b/src/test/scala/com/amazon/deequ/VerificationSuiteTest.scala index 146579e8e..54d9040a4 100644 --- a/src/test/scala/com/amazon/deequ/VerificationSuiteTest.scala +++ b/src/test/scala/com/amazon/deequ/VerificationSuiteTest.scala @@ -19,6 +19,9 @@ package com.amazon.deequ import com.amazon.deequ.analyzers._ import com.amazon.deequ.analyzers.runners.AnalyzerContext import com.amazon.deequ.anomalydetection.AbsoluteChangeStrategy +import com.amazon.deequ.anomalydetection.AnomalyDetectionDataPoint +import com.amazon.deequ.anomalydetection.Bound +import com.amazon.deequ.anomalydetection.BoundedRange import com.amazon.deequ.checks.Check import com.amazon.deequ.checks.CheckLevel import com.amazon.deequ.checks.CheckStatus @@ -1085,6 +1088,199 @@ class VerificationSuiteTest extends WordSpec with Matchers with SparkContextSpec } } + "addAnomalyCheckWithExtendedResults should work and output extended results" in withSparkSession { sparkSession => + evaluateWithRepositoryWithHistory { repository => + + val df = getDfWithNRows(sparkSession, 11) + val saveResultsWithKey = ResultKey(5, Map.empty) + + val analyzers = Completeness("item") :: Nil + + val verificationResultOne = VerificationSuite() + .onData(df) + .useRepository(repository) + .addRequiredAnalyzers(analyzers) + .saveOrAppendResult(saveResultsWithKey) + .addAnomalyCheckWithExtendedResults( + AbsoluteChangeStrategy(Some(-2.0), Some(2.0)), + Size(), + Some(AnomalyCheckConfig(CheckLevel.Warning, "Anomaly check to fail")) + ) + .run() + + val verificationResultTwo = VerificationSuite() + .onData(df) + .useRepository(repository) + .addRequiredAnalyzers(analyzers) + .saveOrAppendResult(saveResultsWithKey) + .addAnomalyCheckWithExtendedResults( + AbsoluteChangeStrategy(Some(-7.0), Some(7.0)), + Size(), + Some(AnomalyCheckConfig(CheckLevel.Error, "Anomaly check to succeed", + Map.empty, Some(0), Some(11))) + ) + .run() + + val verificationResultThree = VerificationSuite() + .onData(df) + .useRepository(repository) + .addRequiredAnalyzers(analyzers) + .saveOrAppendResult(saveResultsWithKey) + .addAnomalyCheckWithExtendedResults( + AbsoluteChangeStrategy(Some(-7.0), Some(7.0)), + Size() + ) + .run() + + val checkResultsOne = verificationResultOne.checkResults.head._2.status + val actualResultsOneAnomalyDetectionDataPoint = + verificationResultOne.checkResults.head._2.constraintResults.head + .anomalyDetectionExtendedResultOption.get.anomalyDetectionDataPoint + val expectedResultsOneAnomalyDetectionDataPoint = + AnomalyDetectionDataPoint(11.0, 7.0, BoundedRange(Bound(-2.0, inclusive = true), + Bound(2.0, inclusive = true)), isAnomaly = true, 1.0) + + val checkResultsTwo = verificationResultTwo.checkResults.head._2.status + val actualResultsTwoAnomalyDetectionDataPoint = + verificationResultTwo.checkResults.head._2.constraintResults.head + .anomalyDetectionExtendedResultOption.get.anomalyDetectionDataPoint + val expectedResultsTwoAnomalyDetectionDataPoint = + AnomalyDetectionDataPoint(11.0, 0.0, BoundedRange(Bound(-7.0, inclusive = true), + Bound(7.0, inclusive = true)), isAnomaly = false, 1.0) + + val checkResultsThree = verificationResultThree.checkResults.head._2.status + val actualResultsThreeAnomalyDetectionDataPoint = + verificationResultThree.checkResults.head._2.constraintResults.head + .anomalyDetectionExtendedResultOption.get.anomalyDetectionDataPoint + val expectedResultsThreeAnomalyDetectionDataPoint = + AnomalyDetectionDataPoint(11.0, 0.0, BoundedRange(Bound(-7.0, 
inclusive = true), + Bound(7.0, inclusive = true)), isAnomaly = false, 1.0) + + assert(checkResultsOne == CheckStatus.Warning) + assert(checkResultsTwo == CheckStatus.Success) + assert(checkResultsThree == CheckStatus.Success) + + assert(actualResultsOneAnomalyDetectionDataPoint == expectedResultsOneAnomalyDetectionDataPoint) + assert(actualResultsTwoAnomalyDetectionDataPoint == expectedResultsTwoAnomalyDetectionDataPoint) + assert(actualResultsThreeAnomalyDetectionDataPoint == expectedResultsThreeAnomalyDetectionDataPoint) + } + } + + "addAnomalyCheckWithExtendedResults with duplicate check analyzer should work and output extended results" in + withSparkSession { sparkSession => + evaluateWithRepositoryWithHistory { repository => + + val df = getDfWithNRows(sparkSession, 11) + val saveResultsWithKey = ResultKey(5, Map.empty) + + val analyzers = Completeness("item") :: Nil + + val verificationResultOne = VerificationSuite() + .onData(df) + .addCheck(Check(CheckLevel.Error, "group-1").hasSize(_ == 11)) + .useRepository(repository) + .addRequiredAnalyzers(analyzers) + .saveOrAppendResult(saveResultsWithKey) + .addAnomalyCheckWithExtendedResults( + AbsoluteChangeStrategy(Some(-2.0), Some(2.0)), + Size(), + Some(AnomalyCheckConfig(CheckLevel.Warning, "Anomaly check to fail")) + ) + .run() + + val verificationResultTwo = VerificationSuite() + .onData(df) + .useRepository(repository) + .addRequiredAnalyzers(analyzers) + .saveOrAppendResult(saveResultsWithKey) + .addAnomalyCheckWithExtendedResults( + AbsoluteChangeStrategy(Some(-7.0), Some(7.0)), + Size(), + Some(AnomalyCheckConfig(CheckLevel.Error, "Anomaly check to succeed", + Map.empty, Some(0), Some(11))) + ) + .run() + + val checkResultsOne = verificationResultOne.checkResults.values.toSeq(1).status + val actualResultsOneAnomalyDetectionDataPoint = + verificationResultOne.checkResults.values.toSeq(1).constraintResults.head + .anomalyDetectionExtendedResultOption.get.anomalyDetectionDataPoint + val expectedResultsOneAnomalyDetectionDataPoint = + AnomalyDetectionDataPoint(11.0, 7.0, BoundedRange(Bound(-2.0, inclusive = true), + Bound(2.0, inclusive = true)), isAnomaly = true, 1.0) + + val checkResultsTwo = verificationResultTwo.checkResults.head._2.status + val actualResultsTwoAnomalyDetectionDataPoint = + verificationResultTwo.checkResults.head._2.constraintResults.head + .anomalyDetectionExtendedResultOption.get.anomalyDetectionDataPoint + val expectedResultsTwoAnomalyDetectionDataPoint = + AnomalyDetectionDataPoint(11.0, 0.0, BoundedRange(Bound(-7.0, inclusive = true), + Bound(7.0, inclusive = true)), isAnomaly = false, 1.0) + + assert(checkResultsOne == CheckStatus.Warning) + assert(checkResultsTwo == CheckStatus.Success) + + assert(actualResultsOneAnomalyDetectionDataPoint == expectedResultsOneAnomalyDetectionDataPoint) + assert(actualResultsTwoAnomalyDetectionDataPoint == expectedResultsTwoAnomalyDetectionDataPoint) + } + } + + + "addAnomalyCheckWithExtendedResults with two anomaly checks on the same suite should work and " + + "output extended results" in + withSparkSession { sparkSession => + evaluateWithRepositoryWithHistory { repository => + + val df = getDfWithNRows(sparkSession, 11) + val saveResultsWithKey = ResultKey(5, Map.empty) + + val analyzers = Completeness("item") :: Nil + + val verificationResultOne = VerificationSuite() + .onData(df) + .addCheck(Check(CheckLevel.Error, "group-1").hasSize(_ == 11)) + .useRepository(repository) + .addRequiredAnalyzers(analyzers) + .saveOrAppendResult(saveResultsWithKey) + 
.addAnomalyCheckWithExtendedResults( + AbsoluteChangeStrategy(Some(-2.0), Some(2.0)), + Size(), + Some(AnomalyCheckConfig(CheckLevel.Warning, "Anomaly check to fail")) + ) + .addAnomalyCheckWithExtendedResults( + AbsoluteChangeStrategy(Some(-7.0), Some(7.0)), + Size(), + Some(AnomalyCheckConfig(CheckLevel.Error, "Anomaly check to succeed", + Map.empty, Some(0), Some(11))) + ) + .run() + + + val checkResultsOne = verificationResultOne.checkResults.values.toSeq(1).status + val actualResultsOneAnomalyDetectionDataPoint = + verificationResultOne.checkResults.values.toSeq(1).constraintResults.head + .anomalyDetectionExtendedResultOption.get.anomalyDetectionDataPoint + val expectedResultsOneAnomalyDetectionDataPoint = + AnomalyDetectionDataPoint(11.0, 7.0, BoundedRange(Bound(-2.0, inclusive = true), + Bound(2.0, inclusive = true)), isAnomaly = true, 1.0) + + val checkResultsTwo = verificationResultOne.checkResults.values.toSeq(2).status + val actualResultsTwoAnomalyDetectionDataPoint = + verificationResultOne.checkResults.values.toSeq(2).constraintResults.head + .anomalyDetectionExtendedResultOption.get.anomalyDetectionDataPoint + val expectedResultsTwoAnomalyDetectionDataPoint = + AnomalyDetectionDataPoint(11.0, 7.0, BoundedRange(Bound(-7.0, inclusive = true), + Bound(7.0, inclusive = true)), isAnomaly = false, 1.0) + + + assert(checkResultsOne == CheckStatus.Warning) + assert(checkResultsTwo == CheckStatus.Success) + + assert(actualResultsOneAnomalyDetectionDataPoint == expectedResultsOneAnomalyDetectionDataPoint) + assert(actualResultsTwoAnomalyDetectionDataPoint == expectedResultsTwoAnomalyDetectionDataPoint) + } + } + "write output files to specified locations" in withSparkSession { sparkSession => val df = getDfWithNumericValues(sparkSession) diff --git a/src/test/scala/com/amazon/deequ/anomalydetection/AbsoluteChangeStrategyTest.scala b/src/test/scala/com/amazon/deequ/anomalydetection/AbsoluteChangeStrategyTest.scala index 66d3c737a..1c435b546 100644 --- a/src/test/scala/com/amazon/deequ/anomalydetection/AbsoluteChangeStrategyTest.scala +++ b/src/test/scala/com/amazon/deequ/anomalydetection/AbsoluteChangeStrategyTest.scala @@ -17,20 +17,14 @@ package com.amazon.deequ.anomalydetection import breeze.linalg.DenseVector -import org.scalatest.{Matchers, WordSpec} +import org.scalatest.Matchers +import org.scalatest.WordSpec class AbsoluteChangeStrategyTest extends WordSpec with Matchers { "Absolute Change Strategy" should { - val strategy = AbsoluteChangeStrategy(Some(-2.0), Some(2.0)) - val data = (for (i <- 0 to 50) yield { - if (i < 20 || i > 30) { - 1.0 - } else { - if (i % 2 == 0) i else -i - } - }).toVector + val (strategy, data) = setupDefaultStrategyAndData() "detect all anomalies if no interval specified" in { val anomalyResult = strategy.detect(data) @@ -156,7 +150,173 @@ class AbsoluteChangeStrategyTest extends WordSpec with Matchers { assert(value < lowerBound || value > upperBound) } } + } + + + "Absolute Change Strategy using Extended Results" should { + + val (strategy, data) = setupDefaultStrategyAndData() + + "detect all anomalies if no interval specified" in { + val anomalyResult = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + val expectedAnomalyCheckRange = BoundedRange(Bound(-2.0, inclusive = true), Bound(2.0, inclusive = true)) + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (20, AnomalyDetectionDataPoint(20, 19, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (21, AnomalyDetectionDataPoint(-21, -41, 
expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (22, AnomalyDetectionDataPoint(22, 43, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (23, AnomalyDetectionDataPoint(-23, -45, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (24, AnomalyDetectionDataPoint(24, 47, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (25, AnomalyDetectionDataPoint(-25, -49, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (26, AnomalyDetectionDataPoint(26, 51, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (27, AnomalyDetectionDataPoint(-27, -53, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (28, AnomalyDetectionDataPoint(28, 55, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (29, AnomalyDetectionDataPoint(-29, -57, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (30, AnomalyDetectionDataPoint(30, 59, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (31, AnomalyDetectionDataPoint(1, -29, expectedAnomalyCheckRange, isAnomaly = true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + "only detect anomalies in interval" in { + val anomalyResult = strategy.detectWithExtendedResults(data, (25, 50)).filter({case (_, anom) => anom.isAnomaly}) + val expectedAnomalyCheckRange = BoundedRange(Bound(-2.0, inclusive = true), Bound(2.0, inclusive = true)) + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (25, AnomalyDetectionDataPoint(-25, -49, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (26, AnomalyDetectionDataPoint(26, 51, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (27, AnomalyDetectionDataPoint(-27, -53, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (28, AnomalyDetectionDataPoint(28, 55, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (29, AnomalyDetectionDataPoint(-29, -57, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (30, AnomalyDetectionDataPoint(30, 59, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (31, AnomalyDetectionDataPoint(1, -29, expectedAnomalyCheckRange, isAnomaly = true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + "ignore min rate if none is given" in { + val strategy = AbsoluteChangeStrategy(None, Some(1.0)) + val anomalyResult = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + val expectedAnomalyCheckRange = BoundedRange(lowerBound = Bound(Double.MinValue, inclusive = true), + upperBound = Bound(1.0, inclusive = true)) + // Anomalies with positive values only + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (20, AnomalyDetectionDataPoint(20, 19, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (22, AnomalyDetectionDataPoint(22, 43, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (24, AnomalyDetectionDataPoint(24, 47, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (26, AnomalyDetectionDataPoint(26, 51, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (28, AnomalyDetectionDataPoint(28, 55, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (30, AnomalyDetectionDataPoint(30, 59, expectedAnomalyCheckRange, isAnomaly = true, 1.0)) + ) + + assert(anomalyResult == expectedResult) + } + + "ignore max rate if none is given" in { + val strategy = AbsoluteChangeStrategy(Some(-1.0), None) + val anomalyResult = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + val expectedAnomalyCheckRange = BoundedRange(lowerBound = Bound(-1.0, inclusive = true), + upperBound = Bound(Double.MaxValue, inclusive = true)) + + // Anomalies with negative values only + 
val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (21, AnomalyDetectionDataPoint(-21, -41, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (23, AnomalyDetectionDataPoint(-23, -45, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (25, AnomalyDetectionDataPoint(-25, -49, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (27, AnomalyDetectionDataPoint(-27, -53, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (29, AnomalyDetectionDataPoint(-29, -57, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (31, AnomalyDetectionDataPoint(1, -29, expectedAnomalyCheckRange, isAnomaly = true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + "detect no anomalies if rates are set to min/ max value" in { + val strategy = AbsoluteChangeStrategy(Some(Double.MinValue), Some(Double.MaxValue)) + val anomalyResult = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + + val expectedResult: List[(Int, AnomalyDetectionDataPoint)] = List() + assert(anomalyResult == expectedResult) + } + + "attribute indices correctly for higher orders without search interval" in { + val data = Vector(0.0, 1.0, 3.0, 6.0, 18.0, 72.0) + val strategy = AbsoluteChangeStrategy(None, Some(8.0), order = 2) + val result = strategy.detectWithExtendedResults(data).filter({ case (_, anom) => anom.isAnomaly }) + + val expectedResult = Seq( + (4, AnomalyDetectionDataPoint(18.0, 9.0, BoundedRange(lowerBound = Bound(Double.MinValue, inclusive = true), + upperBound = Bound(8.0, inclusive = true)), isAnomaly = true, 1.0)), + (5, AnomalyDetectionDataPoint(72.0, 42.0, BoundedRange(lowerBound = Bound(Double.MinValue, inclusive = true), + upperBound = Bound(8.0, inclusive = true)), isAnomaly = true, 1.0)) + ) + assert(result == expectedResult) + } + + "attribute indices correctly for higher orders with search interval" in { + val data = Vector(0.0, 1.0, 3.0, 6.0, 18.0, 72.0) + val strategy = AbsoluteChangeStrategy(None, Some(8.0), order = 2) + val result = strategy.detectWithExtendedResults(data, (5, 6)).filter({case (_, anom) => anom.isAnomaly}) + + val expectedResult = Seq( + (5, AnomalyDetectionDataPoint(72.0, 42.0, BoundedRange(lowerBound = Bound(Double.MinValue, inclusive = true), + upperBound = Bound(8.0, inclusive = true)), isAnomaly = true, 1.0)) + ) + assert(result == expectedResult) + } + + "behave like the threshold strategy when order is 0" in { + val data = Vector(1.0, -1.0, 4.0, -7.0) + val result = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + + val expectedResult = Seq( + (2, AnomalyDetectionDataPoint(4.0, 5.0, BoundedRange(Bound(-2.0, inclusive = true), + Bound(2.0, inclusive = true)), isAnomaly = true, 1.0)), + (3, AnomalyDetectionDataPoint(-7.0, -11.0, BoundedRange(Bound(-2.0, inclusive = true), + Bound(2.0, inclusive = true)), isAnomaly = true, 1.0)) + ) + assert(result == expectedResult) + } + + + "work fine with empty input" in { + val emptySeries = Vector[Double]() + val anomalyResult = strategy.detectWithExtendedResults(emptySeries).filter({case (_, anom) => anom.isAnomaly}) + + assert(anomalyResult == Seq[(Int, AnomalyDetectionDataPoint)]()) + } + + "produce error message with correct value and bounds" in { + val result = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + + result.foreach { case (_, anom) => + val (value, lowerBound, upperBound) = + AnomalyDetectionTestUtils.firstThreeDoublesFromString(anom.detail.get) + + assert(value === anom.anomalyMetricValue) + 
assert(value < lowerBound || value > upperBound) + } + } + "assert anomalies are outside of anomaly bounds" in { + val result = strategy.detectWithExtendedResults(data).filter({ case (_, anom) => anom.isAnomaly }) + + result.foreach { case (_, anom) => + val value = anom.anomalyMetricValue + val upperBound = anom.anomalyCheckRange.upperBound.value + val lowerBound = anom.anomalyCheckRange.lowerBound.value + + assert(value < lowerBound || value > upperBound) + } + } + } + private def setupDefaultStrategyAndData(): (AbsoluteChangeStrategy, Vector[Double]) = { + val strategy = AbsoluteChangeStrategy(Some(-2.0), Some(2.0)) + val data = (for (i <- 0 to 50) yield { + if (i < 20 || i > 30) { + 1.0 + } else { + if (i % 2 == 0) i else -i + } + }).toVector + (strategy, data) } } diff --git a/src/test/scala/com/amazon/deequ/anomalydetection/AnomalyDetectorTest.scala b/src/test/scala/com/amazon/deequ/anomalydetection/AnomalyDetectorTest.scala index 08f411bd1..cdb87b763 100644 --- a/src/test/scala/com/amazon/deequ/anomalydetection/AnomalyDetectorTest.scala +++ b/src/test/scala/com/amazon/deequ/anomalydetection/AnomalyDetectorTest.scala @@ -17,18 +17,21 @@ package com.amazon.deequ.anomalydetection import org.scalamock.scalatest.MockFactory -import org.scalatest.{Matchers, PrivateMethodTester, WordSpec} +import org.scalatest.Matchers +import org.scalatest.PrivateMethodTester +import org.scalatest.WordSpec class AnomalyDetectorTest extends WordSpec with Matchers with MockFactory with PrivateMethodTester { - private val fakeAnomalyDetector = stub[AnomalyDetectionStrategy] - val aD = AnomalyDetector(fakeAnomalyDetector) - val data = Seq((0L, -1.0), (1L, 2.0), (2L, 3.0), (3L, 0.5)).map { case (t, v) => - DataPoint[Double](t, Option(v)) - } "Anomaly Detector" should { + val fakeAnomalyDetector = stub[AnomalyDetectionStrategy] + + val aD = AnomalyDetector(fakeAnomalyDetector) + val data = Seq((0L, -1.0), (1L, 2.0), (2L, 3.0), (3L, 0.5)).map { case (t, v) => + DataPoint[Double](t, Option(v)) + } "ignore missing values" in { val data = Seq(DataPoint[Double](0L, Option(1.0)), DataPoint[Double](1L, Option(2.0)), @@ -105,4 +108,115 @@ class AnomalyDetectorTest extends WordSpec with Matchers with MockFactory with P } } + + "Anomaly Detector with ExtendedResults" should { + + val fakeAnomalyDetector = stub[AnomalyDetectionStrategyWithExtendedResults] + + // This is used as a default bounded range value for anomaly detection + val defaultBoundedRange = BoundedRange(lowerBound = Bound(0.0, inclusive = true), + upperBound = Bound(1.0, inclusive = true)) + + val aD = AnomalyDetectorWithExtendedResults(fakeAnomalyDetector) + val data = Seq((0L, -1.0), (1L, 2.0), (2L, 3.0), (3L, 0.5)).map { case (t, v) => + DataPoint[Double](t, Option(v)) + } + + "ignore missing values" in { + val data = Seq(DataPoint[Double](0L, Option(1.0)), DataPoint[Double](1L, Option(2.0)), + DataPoint[Double](2L, None), DataPoint[Double](3L, Option(1.0))) + + (fakeAnomalyDetector.detectWithExtendedResults _ when(Vector(1.0, 2.0, 1.0), (0, 3))) + .returns(Seq((1, AnomalyDetectionDataPoint(2.0, 2.0, defaultBoundedRange, confidence = 1.0, + isAnomaly = true)))) + + val anomalyResult = aD.detectAnomaliesInHistoryWithExtendedResults(data, (0L, 4L)) + + assert(anomalyResult == ExtendedDetectionResult(Seq((1L, AnomalyDetectionDataPoint(2.0, 2.0, + defaultBoundedRange, confidence = 1.0, isAnomaly = true))))) + } + + "only detect values in range" in { + (fakeAnomalyDetector.detectWithExtendedResults _ when(Vector(-1.0, 2.0, 3.0, 0.5), (2, 4))) + 
.returns(Seq((2, AnomalyDetectionDataPoint(3.0, 3.0, defaultBoundedRange, confidence = 1.0,
+          isAnomaly = true))))
+
+      val anomalyResult = aD.detectAnomaliesInHistoryWithExtendedResults(data, (2L, 4L))
+
+      assert(anomalyResult == ExtendedDetectionResult(Seq((2L, AnomalyDetectionDataPoint(3.0, 3.0,
+        defaultBoundedRange, confidence = 1.0, isAnomaly = true)))))
+    }
+
+    "throw an error when intervals are not ordered" in {
+      intercept[IllegalArgumentException] {
+        aD.detectAnomaliesInHistoryWithExtendedResults(data, (4, 2))
+      }
+    }
+
+    "treat ordered values with time gaps correctly" in {
+      val data = (for (i <- 1 to 10) yield {
+        (i.toLong * 200L) -> 5.0
+      }).map { case (t, v) =>
+        DataPoint[Double](t, Option(v))
+      }
+
+      (fakeAnomalyDetector.detectWithExtendedResults _ when(data.map(_.metricValue.get).toVector, (0, 2)))
+        .returns(
+          Seq(
+            (0, AnomalyDetectionDataPoint(5.0, 5.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true)),
+            (1, AnomalyDetectionDataPoint(5.0, 5.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true))
+          )
+        )
+
+      val anomalyResult = aD.detectAnomaliesInHistoryWithExtendedResults(data, (200L, 401L))
+
+      assert(anomalyResult == ExtendedDetectionResult(Seq(
+        (200L, AnomalyDetectionDataPoint(5.0, 5.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true)),
+        (400L, AnomalyDetectionDataPoint(5.0, 5.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true)))
+      ))
+    }
+
+    "treat unordered values with time gaps correctly" in {
+      val data = Seq((10L, -1.0), (25L, 2.0), (11L, 3.0), (0L, 0.5)).map { case (t, v) =>
+        DataPoint[Double](t, Option(v))
+      }
+
+      (fakeAnomalyDetector.detectWithExtendedResults _ when(Vector(0.5, -1.0, 3.0, 2.0), (0, 4)))
+        .returns(
+          Seq(
+            (1, AnomalyDetectionDataPoint(-1.0, -1.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true)),
+            (2, AnomalyDetectionDataPoint(3.0, 3.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true)),
+            (3, AnomalyDetectionDataPoint(2.0, 2.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true))
+          )
+        )
+
+      val anomalyResult = aD.detectAnomaliesInHistoryWithExtendedResults(data)
+
+      assert(anomalyResult == ExtendedDetectionResult(
+        Seq((10L, AnomalyDetectionDataPoint(-1.0, -1.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true)),
+          (11L, AnomalyDetectionDataPoint(3.0, 3.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true)),
+          (25L, AnomalyDetectionDataPoint(2.0, 2.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true)))))
+    }
+
+    "treat unordered values without time gaps correctly" in {
+      val data = Seq((1L, -1.0), (3L, 2.0), (2L, 3.0), (0L, 0.5)).map { case (t, v) =>
+        DataPoint[Double](t, Option(v))
+      }
+
+      (fakeAnomalyDetector.detectWithExtendedResults _ when(Vector(0.5, -1.0, 3.0, 2.0), (0, 4)))
+        .returns(Seq((1, AnomalyDetectionDataPoint(-1.0, -1.0, defaultBoundedRange, confidence = 1.0,
+          isAnomaly = true)),
+          (2, AnomalyDetectionDataPoint(3.0, 3.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true)),
+          (3, AnomalyDetectionDataPoint(2.0, 2.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true))))
+
+      val anomalyResult = aD.detectAnomaliesInHistoryWithExtendedResults(data)
+
+      assert(anomalyResult == ExtendedDetectionResult(Seq(
+        (1L, AnomalyDetectionDataPoint(-1.0, -1.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true)),
+        (2L, AnomalyDetectionDataPoint(3.0, 3.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true)),
+        (3L, AnomalyDetectionDataPoint(2.0, 2.0,
defaultBoundedRange, confidence = 1.0, isAnomaly = true))))) + } + + } } diff --git a/src/test/scala/com/amazon/deequ/anomalydetection/BatchNormalStrategyTest.scala b/src/test/scala/com/amazon/deequ/anomalydetection/BatchNormalStrategyTest.scala index 05b9a6272..1634053eb 100644 --- a/src/test/scala/com/amazon/deequ/anomalydetection/BatchNormalStrategyTest.scala +++ b/src/test/scala/com/amazon/deequ/anomalydetection/BatchNormalStrategyTest.scala @@ -16,7 +16,8 @@ package com.amazon.deequ.anomalydetection -import org.scalatest.{Matchers, WordSpec} +import org.scalatest.Matchers +import org.scalatest.WordSpec import scala.util.Random @@ -24,19 +25,7 @@ class BatchNormalStrategyTest extends WordSpec with Matchers { "Batch Normal Strategy" should { - val strategy = - BatchNormalStrategy(lowerDeviationFactor = Some(1.0), upperDeviationFactor = Some(1.0)) - - val r = new Random(1) - val dist = (for (_ <- 0 to 49) yield { - r.nextGaussian() - }).toArray - - for (i <- 20 to 30) { - dist(i) += i + (i % 2 * -2 * i) - } - - val data = dist.toVector + val (strategy, data) = setupDefaultStrategyAndData() "only detect anomalies in interval" in { val anomalyResult = strategy.detect(data, (25, 50)) @@ -120,4 +109,136 @@ class BatchNormalStrategyTest extends WordSpec with Matchers { } } } + + "Batch Normal Strategy using Extended Results " should { + + val (strategy, data) = setupDefaultStrategyAndData() + + "only detect anomalies in interval" in { + val anomalyResult = + strategy.detectWithExtendedResults(data, (25, 50)).filter({ case (_, anom) => anom.isAnomaly }) + + val expectedAnomalyCheckRange = BoundedRange(Bound(-9.280850004177061, inclusive = true), + Bound(10.639954755150061, inclusive = true)) + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (25, AnomalyDetectionDataPoint(data(25), data(25), expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (26, AnomalyDetectionDataPoint(data(26), data(26), expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (27, AnomalyDetectionDataPoint(data(27), data(27), expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (28, AnomalyDetectionDataPoint(data(28), data(28), expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (29, AnomalyDetectionDataPoint(data(29), data(29), expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (30, AnomalyDetectionDataPoint(data(30), data(30), expectedAnomalyCheckRange, isAnomaly = true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + "ignore lower factor if none is given" in { + val strategy = BatchNormalStrategy(None, Some(1.0)) + val anomalyResult = + strategy.detectWithExtendedResults(data, (20, 31)).filter({ case (_, anom) => anom.isAnomaly }) + + val expectedAnomalyCheckRange = BoundedRange(Bound(Double.NegativeInfinity, inclusive = true), + Bound(0.7781496015857838, inclusive = true)) + // Anomalies with positive values only + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (20, AnomalyDetectionDataPoint(data(20), data(20), expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (22, AnomalyDetectionDataPoint(data(22), data(22), expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (24, AnomalyDetectionDataPoint(data(24), data(24), expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (26, AnomalyDetectionDataPoint(data(26), data(26), expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (28, AnomalyDetectionDataPoint(data(28), data(28), expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (30, AnomalyDetectionDataPoint(data(30), data(30), expectedAnomalyCheckRange, isAnomaly 
= true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + "ignore upper factor if none is given" in { + val strategy = BatchNormalStrategy(Some(1.0), None) + val anomalyResult = + strategy.detectWithExtendedResults(data, (10, 30)).filter({ case (_, anom) => anom.isAnomaly }) + val expectedAnomalyCheckRange = BoundedRange(Bound(-5.063730045618394, inclusive = true), + Bound(Double.PositiveInfinity, inclusive = true)) + + // Anomalies with negative values only + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (21, AnomalyDetectionDataPoint(data(21), data(21), expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (23, AnomalyDetectionDataPoint(data(23), data(23), expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (25, AnomalyDetectionDataPoint(data(25), data(25), expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (27, AnomalyDetectionDataPoint(data(27), data(27), expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (29, AnomalyDetectionDataPoint(data(29), data(29), expectedAnomalyCheckRange, isAnomaly = true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + "ignore values in interval for mean/ stdDev if specified" in { + val data = Vector(1.0, 1.0, 1.0, 1000.0, 500.0, 1.0) + val strategy = BatchNormalStrategy(Some(3.0), Some(3.0)) + val anomalyResult = + strategy.detectWithExtendedResults(data, (3, 5)).filter({ case (_, anom) => anom.isAnomaly }) + + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (3, AnomalyDetectionDataPoint(1000, 1000, BoundedRange(Bound(1.0, inclusive = true), + Bound(1.0, inclusive = true)), isAnomaly = true, 1.0)), + (4, AnomalyDetectionDataPoint(500, 500, BoundedRange(Bound(1.0, inclusive = true), + Bound(1.0, inclusive = true)), isAnomaly = true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + "throw an exception when trying to exclude all data points from calculation" in { + val strategy = BatchNormalStrategy() + intercept[IllegalArgumentException] { + strategy.detectWithExtendedResults(data).filter({ case (_, anom) => anom.isAnomaly }) + } + } + "detect no anomalies if factors are set to max value" in { + val strategy = BatchNormalStrategy(Some(Double.MaxValue), Some(Double.MaxValue)) + val anomalyResult = + strategy.detectWithExtendedResults(data, (30, 51)).filter({ case (_, anom) => anom.isAnomaly }) + + val expected: List[(Int, AnomalyDetectionDataPoint)] = List() + assert(anomalyResult == expected) + } + + "produce error message with correct value and bounds" in { + val result = strategy.detectWithExtendedResults(data, (25, 50)).filter({ case (_, anom) => anom.isAnomaly }) + + result.foreach { case (_, anom) => + val (value, lowerBound, upperBound) = + AnomalyDetectionTestUtils.firstThreeDoublesFromString(anom.detail.get) + + assert(value === anom.anomalyMetricValue) + assert(value < lowerBound || value > upperBound) + } + } + + "assert anomalies are outside of anomaly bounds" in { + val result = strategy.detectWithExtendedResults(data, (25, 50)).filter({ case (_, anom) => anom.isAnomaly }) + + result.foreach { case (_, anom) => + val value = anom.anomalyMetricValue + val upperBound = anom.anomalyCheckRange.upperBound.value + val lowerBound = anom.anomalyCheckRange.lowerBound.value + + assert(value < lowerBound || value > upperBound) + } + + + } + } + + private def setupDefaultStrategyAndData(): (BatchNormalStrategy, Vector[Double]) = { + val strategy = + BatchNormalStrategy(lowerDeviationFactor = Some(1.0), upperDeviationFactor = Some(1.0)) + + val r = new Random(1) + val dist = (for 
(_ <- 0 to 49) yield { + r.nextGaussian() + }).toArray + + for (i <- 20 to 30) { + dist(i) += i + (i % 2 * -2 * i) + } + + val data = dist.toVector + (strategy, data) + } } diff --git a/src/test/scala/com/amazon/deequ/anomalydetection/OnlineNormalStrategyTest.scala b/src/test/scala/com/amazon/deequ/anomalydetection/OnlineNormalStrategyTest.scala index 781ffb7ad..28f8ebdf7 100644 --- a/src/test/scala/com/amazon/deequ/anomalydetection/OnlineNormalStrategyTest.scala +++ b/src/test/scala/com/amazon/deequ/anomalydetection/OnlineNormalStrategyTest.scala @@ -16,7 +16,8 @@ package com.amazon.deequ.anomalydetection -import org.scalatest.{Matchers, WordSpec} +import org.scalatest.Matchers +import org.scalatest.WordSpec import breeze.stats.meanAndVariance import scala.util.Random @@ -26,18 +27,8 @@ class OnlineNormalStrategyTest extends WordSpec with Matchers { "Online Normal Strategy" should { - val strategy = OnlineNormalStrategy(lowerDeviationFactor = Some(1.5), - upperDeviationFactor = Some(1.5), ignoreStartPercentage = 0.2) - val r = new Random(1) - - val dist = (for (_ <- 0 to 50) yield { - r.nextGaussian() - }).toArray - - for (i <- 20 to 30) - dist(i) += i + (i % 2 * -2 * i) + val (strategy, data, r) = setupDefaultStrategyAndData() - val data = dist.toVector "detect all anomalies if no interval specified" in { val strategy = OnlineNormalStrategy(lowerDeviationFactor = Some(3.5), @@ -168,4 +159,188 @@ class OnlineNormalStrategyTest extends WordSpec with Matchers { } } } + + "Online Normal Strategy with Extended Results" should { + + val (strategy, data, r) = setupDefaultStrategyAndData() + "detect all anomalies if no interval specified" in { + val strategy = OnlineNormalStrategy(lowerDeviationFactor = Some(3.5), + upperDeviationFactor = Some(3.5), ignoreStartPercentage = 0.2) + val anomalyResult = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (20, AnomalyDetectionDataPoint(data(20), data(20), + BoundedRange(Bound(-14.868489924421404, inclusive = true), Bound(14.255383455388895, inclusive = true)), + isAnomaly = true, 1.0)), + (21, AnomalyDetectionDataPoint(data(21), data(21), + BoundedRange(Bound(-13.6338479733374, inclusive = true), Bound(13.02074150430489, inclusive = true)), + isAnomaly = true, 1.0)), + (22, AnomalyDetectionDataPoint(data(22), data(22), + BoundedRange(Bound(-16.71733585267535, inclusive = true), Bound(16.104229383642842, inclusive = true)), + isAnomaly = true, 1.0)), + (23, AnomalyDetectionDataPoint(data(23), data(23), + BoundedRange(Bound(-17.346915620547467, inclusive = true), Bound(16.733809151514958, inclusive = true)), + isAnomaly = true, 1.0)), + (24, AnomalyDetectionDataPoint(data(24), data(24), + BoundedRange(Bound(-17.496117397890874, inclusive = true), Bound(16.883010928858365, inclusive = true)), + isAnomaly = true, 1.0)), + (25, AnomalyDetectionDataPoint(data(25), data(25), + BoundedRange(Bound(-17.90391150851199, inclusive = true), Bound(17.29080503947948, inclusive = true)), + isAnomaly = true, 1.0)), + (26, AnomalyDetectionDataPoint(data(26), data(26), + BoundedRange(Bound(-17.028892797350824, inclusive = true), Bound(16.415786328318315, inclusive = true)), + isAnomaly = true, 1.0)), + (27, AnomalyDetectionDataPoint(data(27), data(27), + BoundedRange(Bound(-17.720100310354653, inclusive = true), Bound(17.106993841322144, inclusive = true)), + isAnomaly = true, 1.0)), + (28, AnomalyDetectionDataPoint(data(28), data(28), + 
BoundedRange(Bound(-18.23663168508628, inclusive = true), Bound(17.62352521605377, inclusive = true)), + isAnomaly = true, 1.0)), + (29, AnomalyDetectionDataPoint(data(29), data(29), + BoundedRange(Bound(-19.32641622778204, inclusive = true), Bound(18.71330975874953, inclusive = true)), + isAnomaly = true, 1.0)), + (30, AnomalyDetectionDataPoint(data(30), data(30), + BoundedRange(Bound(-18.96540323993527, inclusive = true), Bound(18.35229677090276, inclusive = true)), + isAnomaly = true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + "only detect anomalies in interval" in { + val anomalyResult = strategy.detectWithExtendedResults(data, (25, 31)).filter({case (_, anom) => anom.isAnomaly}) + + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (25, AnomalyDetectionDataPoint(data(25), data(25), + BoundedRange(Bound(-15.630116599125694, inclusive = true), Bound(16.989221350098695, inclusive = true)), + isAnomaly = true, 1.0)), + (26, AnomalyDetectionDataPoint(data(26), data(26), + BoundedRange(Bound(-14.963376676338362, inclusive = true), Bound(16.322481427311363, inclusive = true)), + isAnomaly = true, 1.0)), + (27, AnomalyDetectionDataPoint(data(27), data(27), + BoundedRange(Bound(-15.131834814393196, inclusive = true), Bound(16.490939565366197, inclusive = true)), + isAnomaly = true, 1.0)), + (28, AnomalyDetectionDataPoint(data(28), data(28), + BoundedRange(Bound(-14.76810451038132, inclusive = true), Bound(16.12720926135432, inclusive = true)), + isAnomaly = true, 1.0)), + (29, AnomalyDetectionDataPoint(data(29), data(29), + BoundedRange(Bound(-15.078145049879462, inclusive = true), Bound(16.437249800852463, inclusive = true)), + isAnomaly = true, 1.0)), + (30, AnomalyDetectionDataPoint(data(30), data(30), + BoundedRange(Bound(-14.540171084298914, inclusive = true), Bound(15.899275835271913, inclusive = true)), + isAnomaly = true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + "ignore lower factor if none is given" in { + val strategy = OnlineNormalStrategy(lowerDeviationFactor = None, + upperDeviationFactor = Some(1.5)) + val anomalyResult = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + + // Anomalies with positive values only + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (20, AnomalyDetectionDataPoint(data(20), data(20), + BoundedRange(Bound(Double.NegativeInfinity, inclusive = true), Bound(5.934276775443095, inclusive = true)), + isAnomaly = true, 1.0)), + (22, AnomalyDetectionDataPoint(data(22), data(22), + BoundedRange(Bound(Double.NegativeInfinity, inclusive = true), Bound(7.979098353666404, inclusive = true)), + isAnomaly = true, 1.0)), + (24, AnomalyDetectionDataPoint(data(24), data(24), + BoundedRange(Bound(Double.NegativeInfinity, inclusive = true), Bound(9.582136909647211, inclusive = true)), + isAnomaly = true, 1.0)), + (26, AnomalyDetectionDataPoint(data(26), data(26), + BoundedRange(Bound(Double.NegativeInfinity, inclusive = true), Bound(10.320400087389258, inclusive = true)), + isAnomaly = true, 1.0)), + (28, AnomalyDetectionDataPoint(data(28), data(28), + BoundedRange(Bound(Double.NegativeInfinity, inclusive = true), Bound(11.113502213504855, inclusive = true)), + isAnomaly = true, 1.0)), + (30, AnomalyDetectionDataPoint(data(30), data(30), + BoundedRange(Bound(Double.NegativeInfinity, inclusive = true), Bound(11.776810456746686, inclusive = true)), + isAnomaly = true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + "ignore upper factor if none is given" in { 
+ val strategy = OnlineNormalStrategy(lowerDeviationFactor = Some(1.5), + upperDeviationFactor = None) + val anomalyResult = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + + // Anomalies with negative values only + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (21, AnomalyDetectionDataPoint(data(21), data(21), + BoundedRange(Bound(-7.855820681098751, inclusive = true), Bound(Double.PositiveInfinity, inclusive = true)), + isAnomaly = true, 1.0)), + (23, AnomalyDetectionDataPoint(data(23), data(23), + BoundedRange(Bound(-10.14631437278386, inclusive = true), Bound(Double.PositiveInfinity, inclusive = true)), + isAnomaly = true, 1.0)), + (25, AnomalyDetectionDataPoint(data(25), data(25), + BoundedRange(Bound(-11.038751996286909, inclusive = true), Bound(Double.PositiveInfinity, inclusive = true)), + isAnomaly = true, 1.0)), + (27, AnomalyDetectionDataPoint(data(27), data(27), + BoundedRange(Bound(-11.359107787232386, inclusive = true), Bound(Double.PositiveInfinity, inclusive = true)), + isAnomaly = true, 1.0)), + (29, AnomalyDetectionDataPoint(data(29), data(29), + BoundedRange(Bound(-12.097995027317015, inclusive = true), Bound(Double.PositiveInfinity, inclusive = true)), + isAnomaly = true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + "work fine with empty input" in { + val emptySeries = Vector[Double]() + val anomalyResult = strategy.detectWithExtendedResults(emptySeries).filter({case (_, anom) => anom.isAnomaly}) + + assert(anomalyResult == Seq[(Int, AnomalyDetectionDataPoint)]()) + } + + "detect no anomalies if factors are set to max value" in { + val strategy = OnlineNormalStrategy(Some(Double.MaxValue), Some(Double.MaxValue)) + val anomalyResult = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + + val expected: List[(Int, AnomalyDetectionDataPoint)] = List() + assert(anomalyResult == expected) + } + + "produce error message with correct value and bounds" in { + val result = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + + result.foreach { case (_, anom) => + val (value, lowerBound, upperBound) = + AnomalyDetectionTestUtils.firstThreeDoublesFromString(anom.detail.get) + + assert(value === anom.anomalyMetricValue) + assert(value < lowerBound || value > upperBound) + } + } + + "assert anomalies are outside of anomaly bounds" in { + val result = strategy.detectWithExtendedResults(data).filter({ case (_, anom) => anom.isAnomaly }) + + result.foreach { case (_, anom) => + val value = anom.anomalyMetricValue + val upperBound = anom.anomalyCheckRange.upperBound.value + val lowerBound = anom.anomalyCheckRange.lowerBound.value + + assert(value < lowerBound || value > upperBound) + } + } + } + + + private def setupDefaultStrategyAndData(): (OnlineNormalStrategy, Vector[Double], Random) = { + val strategy = OnlineNormalStrategy(lowerDeviationFactor = Some(1.5), + upperDeviationFactor = Some(1.5), ignoreStartPercentage = 0.2) + val r = new Random(1) + + val dist = (for (_ <- 0 to 50) yield { + r.nextGaussian() + }).toArray + + for (i <- 20 to 30) + dist(i) += i + (i % 2 * -2 * i) + + val data = dist.toVector + (strategy, data, r) + } } diff --git a/src/test/scala/com/amazon/deequ/anomalydetection/RateOfChangeStrategyTest.scala b/src/test/scala/com/amazon/deequ/anomalydetection/RateOfChangeStrategyTest.scala index 70f66f033..7c87b85ee 100644 --- a/src/test/scala/com/amazon/deequ/anomalydetection/RateOfChangeStrategyTest.scala +++ 
b/src/test/scala/com/amazon/deequ/anomalydetection/RateOfChangeStrategyTest.scala @@ -16,7 +16,8 @@ package com.amazon.deequ.anomalydetection -import org.scalatest.{Matchers, WordSpec} +import org.scalatest.Matchers +import org.scalatest.WordSpec /** * The tested class RateOfChangeStrategy is deprecated. @@ -26,14 +27,7 @@ class RateOfChangeStrategyTest extends WordSpec with Matchers { "RateOfChange Strategy" should { - val strategy = RateOfChangeStrategy(Some(-2.0), Some(2.0)) - val data = (for (i <- 0 to 50) yield { - if (i < 20 || i > 30) { - 1.0 - } else { - if (i % 2 == 0) i else -i - } - }).toVector + val (strategy, data) = setupDefaultStrategyAndData() "detect all anomalies if no interval specified" in { val anomalyResult = strategy.detect(data) @@ -43,4 +37,43 @@ class RateOfChangeStrategyTest extends WordSpec with Matchers { assert(anomalyResult == expected) } } + + "RateOfChange Strategy with Extended Results" should { + + val (strategy, data) = setupDefaultStrategyAndData() + + "detect all anomalies if no interval specified" in { + val anomalyResult = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + + val expectedAnomalyThreshold = BoundedRange(Bound(-2.0, inclusive = true), Bound(2.0, inclusive = true)) + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (20, AnomalyDetectionDataPoint(20, 19, expectedAnomalyThreshold, isAnomaly = true, 1.0)), + (21, AnomalyDetectionDataPoint(-21, -41, expectedAnomalyThreshold, isAnomaly = true, 1.0)), + (22, AnomalyDetectionDataPoint(22, 43, expectedAnomalyThreshold, isAnomaly = true, 1.0)), + (23, AnomalyDetectionDataPoint(-23, -45, expectedAnomalyThreshold, isAnomaly = true, 1.0)), + (24, AnomalyDetectionDataPoint(24, 47, expectedAnomalyThreshold, isAnomaly = true, 1.0)), + (25, AnomalyDetectionDataPoint(-25, -49, expectedAnomalyThreshold, isAnomaly = true, 1.0)), + (26, AnomalyDetectionDataPoint(26, 51, expectedAnomalyThreshold, isAnomaly = true, 1.0)), + (27, AnomalyDetectionDataPoint(-27, -53, expectedAnomalyThreshold, isAnomaly = true, 1.0)), + (28, AnomalyDetectionDataPoint(28, 55, expectedAnomalyThreshold, isAnomaly = true, 1.0)), + (29, AnomalyDetectionDataPoint(-29, -57, expectedAnomalyThreshold, isAnomaly = true, 1.0)), + (30, AnomalyDetectionDataPoint(30, 59, expectedAnomalyThreshold, isAnomaly = true, 1.0)), + (31, AnomalyDetectionDataPoint(1, -29, expectedAnomalyThreshold, isAnomaly = true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + } + + private def setupDefaultStrategyAndData(): (RateOfChangeStrategy, Vector[Double]) = { + val strategy = RateOfChangeStrategy(Some(-2.0), Some(2.0)) + val data = (for (i <- 0 to 50) yield { + if (i < 20 || i > 30) { + 1.0 + } else { + if (i % 2 == 0) i else -i + } + }).toVector + (strategy, data) + } } diff --git a/src/test/scala/com/amazon/deequ/anomalydetection/RelativeRateOfChangeStrategyTest.scala b/src/test/scala/com/amazon/deequ/anomalydetection/RelativeRateOfChangeStrategyTest.scala index bfde6ba18..bd09d0e97 100644 --- a/src/test/scala/com/amazon/deequ/anomalydetection/RelativeRateOfChangeStrategyTest.scala +++ b/src/test/scala/com/amazon/deequ/anomalydetection/RelativeRateOfChangeStrategyTest.scala @@ -17,20 +17,14 @@ package com.amazon.deequ.anomalydetection import breeze.linalg.DenseVector -import org.scalatest.{Matchers, WordSpec} +import org.scalatest.Matchers +import org.scalatest.WordSpec class RelativeRateOfChangeStrategyTest extends WordSpec with Matchers { "Relative Rate of Change Strategy" should { - val 
strategy = RelativeRateOfChangeStrategy(Some(0.5), Some(2.0)) - val data = (for (i <- 0 to 50) yield { - if (i < 20 || i > 30) { - 1.0 - } else { - if (i % 2 == 0) i else 1 - } - }).toVector + val (strategy, data) = setupDefaultStrategyAndData() "detect all anomalies if no interval specified" in { val anomalyResult = strategy.detect(data) @@ -150,4 +144,164 @@ class RelativeRateOfChangeStrategyTest extends WordSpec with Matchers { } } } + + "Relative Rate of Change Strategy with Extended Results" should { + + val (strategy, data) = setupDefaultStrategyAndData() + + "detect all anomalies if no interval specified" in { + val anomalyResult = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + + val expectedAnomalyCheckRange = BoundedRange(Bound(0.5, inclusive = true), Bound(2.0, inclusive = true)) + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (20, AnomalyDetectionDataPoint(20, 20, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (21, AnomalyDetectionDataPoint(1, 0.05, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (22, AnomalyDetectionDataPoint(22, 22, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (23, AnomalyDetectionDataPoint(1, 0.045454545454545456, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (24, AnomalyDetectionDataPoint(24, 24, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (25, AnomalyDetectionDataPoint(1, 0.041666666666666664, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (26, AnomalyDetectionDataPoint(26, 26, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (27, AnomalyDetectionDataPoint(1, 0.038461538461538464, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (28, AnomalyDetectionDataPoint(28, 28, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (29, AnomalyDetectionDataPoint(1, 0.03571428571428571, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (30, AnomalyDetectionDataPoint(30, 30, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (31, AnomalyDetectionDataPoint(1, 0.03333333333333333, expectedAnomalyCheckRange, isAnomaly = true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + "only detect anomalies in interval" in { + val anomalyResult = strategy.detectWithExtendedResults(data, (25, 50)).filter({case (_, anom) => anom.isAnomaly}) + + val expectedAnomalyCheckRange = BoundedRange(Bound(0.5, inclusive = true), Bound(2.0, inclusive = true)) + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (25, AnomalyDetectionDataPoint(1, 0.041666666666666664, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (26, AnomalyDetectionDataPoint(26, 26, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (27, AnomalyDetectionDataPoint(1, 0.038461538461538464, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (28, AnomalyDetectionDataPoint(28, 28, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (29, AnomalyDetectionDataPoint(1, 0.03571428571428571, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (30, AnomalyDetectionDataPoint(30, 30, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (31, AnomalyDetectionDataPoint(1, 0.03333333333333333, expectedAnomalyCheckRange, isAnomaly = true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + "ignore min rate if none is given" in { + val strategy = RelativeRateOfChangeStrategy(None, Some(1.0)) + val anomalyResult = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + + // Anomalies with positive values only + val expectedAnomalyCheckRange = 
BoundedRange(Bound(-1.7976931348623157E308, inclusive = true), + Bound(1.0, inclusive = true)) + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (20, AnomalyDetectionDataPoint(20, 20, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (22, AnomalyDetectionDataPoint(22, 22, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (24, AnomalyDetectionDataPoint(24, 24, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (26, AnomalyDetectionDataPoint(26, 26, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (28, AnomalyDetectionDataPoint(28, 28, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (30, AnomalyDetectionDataPoint(30, 30, expectedAnomalyCheckRange, isAnomaly = true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + "ignore max rate if none is given" in { + val strategy = RelativeRateOfChangeStrategy(Some(0.5), None) + val anomalyResult = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + + // Anomalies with negative values only + val expectedAnomalyCheckRange = BoundedRange(Bound(0.5, inclusive = true), + Bound(1.7976931348623157E308, inclusive = true)) + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (21, AnomalyDetectionDataPoint(1, 0.05, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (23, AnomalyDetectionDataPoint(1, 0.045454545454545456, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (25, AnomalyDetectionDataPoint(1, 0.041666666666666664, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (27, AnomalyDetectionDataPoint(1, 0.038461538461538464, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (29, AnomalyDetectionDataPoint(1, 0.03571428571428571, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (31, AnomalyDetectionDataPoint(1, 0.03333333333333333, expectedAnomalyCheckRange, isAnomaly = true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + "detect no anomalies if rates are set to min/ max value" in { + val strategy = RelativeRateOfChangeStrategy(Some(Double.MinValue), Some(Double.MaxValue)) + val anomalyResult = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + + val expected: List[(Int, AnomalyDetectionDataPoint)] = List() + assert(anomalyResult == expected) + } + + "attribute indices correctly for higher orders without search interval" in { + val data = Vector(0.0, 1.0, 3.0, 6.0, 18.0, 72.0) + val strategy = RelativeRateOfChangeStrategy(None, Some(8.0), order = 2) + val anomalyResult = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (2, AnomalyDetectionDataPoint(3, Double.PositiveInfinity, + BoundedRange(Bound(-1.7976931348623157E308, inclusive = true), Bound(8.0, inclusive = true)), + isAnomaly = true, 1.0)), + (5, AnomalyDetectionDataPoint(72, 12, + BoundedRange(Bound(-1.7976931348623157E308, inclusive = true), Bound(8.0, inclusive = true)), + isAnomaly = true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + "attribute indices correctly for higher orders with search interval" in { + val data = Vector(0.0, 1.0, 3.0, 6.0, 18.0, 72.0) + val strategy = RelativeRateOfChangeStrategy(None, Some(8.0), order = 2) + val anomalyResult = strategy.detectWithExtendedResults(data, (5, 6)).filter({case (_, anom) => anom.isAnomaly}) + + val expectedResult: Seq[(Int, AnomalyDetectionDataPoint)] = Seq( + (5, AnomalyDetectionDataPoint(72, 12, + BoundedRange(Bound(-1.7976931348623157E308, inclusive = true), 
Bound(8.0, inclusive = true)), + isAnomaly = true, 1.0)) + ) + assert(anomalyResult == expectedResult) + } + + "work fine with empty input" in { + val emptySeries = Vector[Double]() + val anomalyResult = strategy.detectWithExtendedResults(emptySeries).filter({case (_, anom) => anom.isAnomaly}) + + assert(anomalyResult == Seq[(Int, AnomalyDetectionDataPoint)]()) + } + + "produce error message with correct value and bounds" in { + val result = strategy.detectWithExtendedResults(data).filter({case (_, anom) => anom.isAnomaly}) + + result.foreach { case (_, anom) => + val (value, lowerBound, upperBound) = + AnomalyDetectionTestUtils.firstThreeDoublesFromString(anom.detail.get) + + assert(value === anom.anomalyMetricValue) + assert(value < lowerBound || value > upperBound) + } + } + + "assert anomalies are outside of anomaly bounds" in { + val result = strategy.detectWithExtendedResults(data).filter({ case (_, anom) => anom.isAnomaly }) + + result.foreach { case (_, anom) => + val value = anom.anomalyMetricValue + val upperBound = anom.anomalyCheckRange.upperBound.value + val lowerBound = anom.anomalyCheckRange.lowerBound.value + + assert(value < lowerBound || value > upperBound) + } + } + + + } + + private def setupDefaultStrategyAndData(): (RelativeRateOfChangeStrategy, Vector[Double]) = { + val strategy = RelativeRateOfChangeStrategy(Some(0.5), Some(2.0)) + val data = (for (i <- 0 to 50) yield { + if (i < 20 || i > 30) { + 1.0 + } else { + if (i % 2 == 0) i else 1 + } + }).toVector + (strategy, data) + } } diff --git a/src/test/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategyTest.scala b/src/test/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategyTest.scala index 92ead9e48..28d49d4c2 100644 --- a/src/test/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategyTest.scala +++ b/src/test/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategyTest.scala @@ -16,14 +16,14 @@ package com.amazon.deequ.anomalydetection -import org.scalatest.{Matchers, WordSpec} +import org.scalatest.Matchers +import org.scalatest.WordSpec class SimpleThresholdStrategyTest extends WordSpec with Matchers { "Simple Threshold Strategy" should { - val strategy = SimpleThresholdStrategy(upperBound = 1.0) - val data = Vector(-1.0, 2.0, 3.0, 0.5) + val (strategy, data) = setupDefaultStrategyAndData() val expected = Seq((1, Anomaly(Option(2.0), 1.0)), (2, Anomaly(Option(3.0), 1.0))) "detect values above threshold" in { @@ -70,5 +70,78 @@ class SimpleThresholdStrategyTest extends WordSpec with Matchers { assert(value < lowerBound || value > upperBound) } } + + "Simple Threshold Strategy with Extended Results" should { + + val (strategy, data) = setupDefaultStrategyAndData() + val expectedAnomalyCheckRange = BoundedRange(lowerBound = Bound(Double.MinValue, inclusive = true), + upperBound = Bound(1.0, inclusive = true)) + val expectedResult = Seq( + (1, AnomalyDetectionDataPoint(2.0, 2.0, expectedAnomalyCheckRange, isAnomaly = true, 1.0)), + (2, AnomalyDetectionDataPoint(3.0, 3.0, expectedAnomalyCheckRange, isAnomaly = true, 1.0))) + + "detect values above threshold" in { + val anomalyResult = + strategy.detectWithExtendedResults(data, (0, 4)).filter({ case (_, anom) => anom.isAnomaly }) + + assert(anomalyResult == expectedResult) + } + + "detect all values without range specified" in { + val anomalyResult = strategy.detectWithExtendedResults(data).filter({ case (_, anom) => anom.isAnomaly }) + + assert(anomalyResult == expectedResult) + } + + "work fine with empty input" in { + val 
diff --git a/src/test/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategyTest.scala b/src/test/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategyTest.scala
index 92ead9e48..28d49d4c2 100644
--- a/src/test/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategyTest.scala
+++ b/src/test/scala/com/amazon/deequ/anomalydetection/SimpleThresholdStrategyTest.scala
@@ -16,14 +16,14 @@
 package com.amazon.deequ.anomalydetection
 
-import org.scalatest.{Matchers, WordSpec}
+import org.scalatest.Matchers
+import org.scalatest.WordSpec
 
 class SimpleThresholdStrategyTest extends WordSpec with Matchers {
 
   "Simple Threshold Strategy" should {
 
-    val strategy = SimpleThresholdStrategy(upperBound = 1.0)
-    val data = Vector(-1.0, 2.0, 3.0, 0.5)
+    val (strategy, data) = setupDefaultStrategyAndData()
     val expected = Seq((1, Anomaly(Option(2.0), 1.0)), (2, Anomaly(Option(3.0), 1.0)))
 
     "detect values above threshold" in {
@@ -70,5 +70,78 @@ class SimpleThresholdStrategyTest extends WordSpec with Matchers {
       assert(value < lowerBound || value > upperBound)
     }
   }
+
+  "Simple Threshold Strategy with Extended Results" should {
+
+    val (strategy, data) = setupDefaultStrategyAndData()
+    val expectedAnomalyCheckRange = BoundedRange(lowerBound = Bound(Double.MinValue, inclusive = true),
+      upperBound = Bound(1.0, inclusive = true))
+    val expectedResult = Seq(
+      (1, AnomalyDetectionDataPoint(2.0, 2.0, expectedAnomalyCheckRange, isAnomaly = true, 1.0)),
+      (2, AnomalyDetectionDataPoint(3.0, 3.0, expectedAnomalyCheckRange, isAnomaly = true, 1.0)))
+
+    "detect values above threshold" in {
+      val anomalyResult =
+        strategy.detectWithExtendedResults(data, (0, 4)).filter({ case (_, anom) => anom.isAnomaly })
+
+      assert(anomalyResult == expectedResult)
+    }
+
+    "detect all values without range specified" in {
+      val anomalyResult = strategy.detectWithExtendedResults(data).filter({ case (_, anom) => anom.isAnomaly })
+
+      assert(anomalyResult == expectedResult)
+    }
+
+    "work fine with empty input" in {
+      val emptySeries = Vector[Double]()
+      val anomalyResult =
+        strategy.detectWithExtendedResults(emptySeries).filter({ case (_, anom) => anom.isAnomaly })
+
+      assert(anomalyResult == Seq[(Int, AnomalyDetectionDataPoint)]())
+    }
+
+    "work with upper and lower threshold" in {
+      val tS = SimpleThresholdStrategy(lowerBound = -0.5, upperBound = 1.0)
+      val anomalyResult = tS.detectWithExtendedResults(data).filter({ case (_, anom) => anom.isAnomaly })
+      val expectedAnomalyCheckRange = BoundedRange(Bound(-0.5, inclusive = true), Bound(1.0, inclusive = true))
+      val expectedResult = Seq(
+        (0, AnomalyDetectionDataPoint(-1.0, -1.0, expectedAnomalyCheckRange, isAnomaly = true, 1.0)),
+        (1, AnomalyDetectionDataPoint(2.0, 2.0, expectedAnomalyCheckRange, isAnomaly = true, 1.0)),
+        (2, AnomalyDetectionDataPoint(3.0, 3.0, expectedAnomalyCheckRange, isAnomaly = true, 1.0)))
+
+      assert(anomalyResult == expectedResult)
+    }
+
+    "produce error message with correct value and bounds" in {
+      val result = strategy.detectWithExtendedResults(data).filter({ case (_, anom) => anom.isAnomaly })
+
+      result.foreach { case (_, anom) =>
+        val (value, lowerBound, upperBound) =
+          AnomalyDetectionTestUtils.firstThreeDoublesFromString(anom.detail.get)
+
+        assert(value === anom.anomalyMetricValue)
+        assert(value < lowerBound || value > upperBound)
+      }
+    }
+
+    "assert anomalies are outside of anomaly bounds" in {
+      val result = strategy.detectWithExtendedResults(data).filter({ case (_, anom) => anom.isAnomaly })
+
+      result.foreach { case (_, anom) =>
+        val value = anom.anomalyMetricValue
+        val upperBound = anom.anomalyCheckRange.upperBound.value
+        val lowerBound = anom.anomalyCheckRange.lowerBound.value
+
+        assert(value < lowerBound || value > upperBound)
+      }
+    }
+  }
+
+  private def setupDefaultStrategyAndData(): (SimpleThresholdStrategy, Vector[Double]) = {
+    val strategy = SimpleThresholdStrategy(upperBound = 1.0)
+    val data = Vector(-1.0, 2.0, 3.0, 0.5)
+    (strategy, data)
+  }
 }
diff --git a/src/test/scala/com/amazon/deequ/anomalydetection/seasonal/HoltWintersTest.scala b/src/test/scala/com/amazon/deequ/anomalydetection/seasonal/HoltWintersTest.scala
index 36689180b..af1854e07 100644
--- a/src/test/scala/com/amazon/deequ/anomalydetection/seasonal/HoltWintersTest.scala
+++ b/src/test/scala/com/amazon/deequ/anomalydetection/seasonal/HoltWintersTest.scala
@@ -16,19 +16,19 @@
 package com.amazon.deequ.anomalydetection.seasonal
 
-import com.amazon.deequ.anomalydetection.Anomaly
+import com.amazon.deequ.anomalydetection.{Anomaly, AnomalyDetectionDataPoint}
 import org.scalatest.matchers.should.Matchers
 import org.scalatest.wordspec.AnyWordSpec
 
+import scala.util.Random
+
 class HoltWintersTest extends AnyWordSpec with Matchers {
 
   import HoltWintersTest._
 
   "Additive Holt-Winters" should {
 
-    val rng = new util.Random(seed = 42L)
-    val twoWeeksOfData = Vector.fill(2)(
-      Vector[Double](1, 1, 1.2, 1.3, 1.5, 2.1, 1.9)
-    ).flatten.map(_ + rng.nextGaussian())
+    val twoWeeksOfData = setupData()
 
     "fail if start after or equal to end" in {
       val caught = intercept[IllegalArgumentException](
@@ -206,72 +206,273 @@ class HoltWintersTest extends AnyWordSpec with Matchers {
       anomalies should have size 3
     }
-  }
-
-  "work on hourly data with daily seasonality" in {
-    // https://www.kaggle.com/datasets/fedesoriano/traffic-prediction-dataset
-    val hourlyTrafficData = Vector[Double](
-      15, 13, 10, 7, 9, 6, 9, 8, 11, 12, 15, 17, 16, 15, 16, 12, 12, 16, 17, 20, 17, 19, 20, 15,
-      14, 12, 14, 12, 12, 11, 13, 14, 12, 22, 32, 31, 35, 26, 34, 30, 27, 27, 24, 26, 29, 32, 30, 27,
-      21, 18, 19, 13, 11, 11, 11, 14, 15, 29, 33, 32, 32, 29, 27, 26, 28, 26, 25, 29, 26, 24, 25, 20,
-      18, 18, 13, 13, 10, 12, 13, 11, 13, 22, 26, 27, 31, 24, 23, 26, 26, 24, 23, 25, 26, 24, 26, 24,
-      19, 20, 18, 13, 13, 9, 12, 12, 15, 16, 23, 24, 25, 24, 26, 22, 20, 20, 22, 26, 22, 21, 21, 21,
-      16, 18, 19, 14, 12, 13, 14, 14, 13, 20, 22, 26, 26, 21, 23, 23, 19, 19, 20, 24, 18, 19, 16, 17,
-      16, 16, 10, 9, 8, 7, 9, 8, 12, 13, 17, 14, 14, 14, 14, 11, 15, 13, 12, 17, 18, 17, 16, 15, 13
-    )
-
-    val strategy = new HoltWinters(
-      HoltWinters.MetricInterval.Hourly,
-      HoltWinters.SeriesSeasonality.Daily)
-
-    val nDaysTrain = 6
-    val nDaysTest = 1
-    val trainSize = nDaysTrain * 24
-    val testSize = nDaysTest * 24
-    val nTotal = trainSize + testSize
-
-    val anomalies = strategy.detect(
-      hourlyTrafficData.take(nTotal),
-      trainSize -> nTotal
-    )
-
-    anomalies should have size 2
-  }
-
-  "work on monthly data with yearly seasonality using custom seriesPeriodicity" in {
-    // https://datamarket.com/data/set/22ox/monthly-milk-production-pounds-per-cow-jan-62-dec-75
-    val monthlyMilkProduction = Vector[Double](
-      589, 561, 640, 656, 727, 697, 640, 599, 568, 577, 553, 582,
-      600, 566, 653, 673, 742, 716, 660, 617, 583, 587, 565, 598,
-      628, 618, 688, 705, 770, 736, 678, 639, 604, 611, 594, 634,
-      658, 622, 709, 722, 782, 756, 702, 653, 615, 621, 602, 635,
-      677, 635, 736, 755, 811, 798, 735, 697, 661, 667, 645, 688,
-      713, 667, 762, 784, 837, 817, 767, 722, 681, 687, 660, 698,
-      717, 696, 775, 796, 858, 826, 783, 740, 701, 706, 677, 711,
-      734, 690, 785, 805, 871, 845, 801, 764, 725, 723, 690, 734,
-      750, 707, 807, 824, 886, 859, 819, 783, 740, 747, 711, 751,
-      804, 756, 860, 878, 942, 913, 869, 834, 790, 800, 763, 800,
-      826, 799, 890, 900, 961, 935, 894, 855, 809, 810, 766, 805,
-      821, 773, 883, 898, 957, 924, 881, 837, 784, 791, 760, 802,
-      828, 778, 889, 902, 969, 947, 908, 867, 815, 812, 773, 813,
-      834, 782, 892, 903, 966, 937, 896, 858, 817, 827, 797, 843
-    )
-
-    val strategy = new HoltWinters(12)
-
-    val nYearsTrain = 3
-    val nYearsTest = 1
-    val trainSize = nYearsTrain * 12
-    val testSize = nYearsTest * 12
-    val nTotal = trainSize + testSize
-
-    val anomalies = strategy.detect(
-      monthlyMilkProduction.take(nTotal),
-      trainSize -> nTotal
-    )
-
-    anomalies should have size 7
-  }
+    "work on hourly data with daily seasonality" in {
+      // https://www.kaggle.com/datasets/fedesoriano/traffic-prediction-dataset
+      val hourlyTrafficData = Vector[Double](
+        15, 13, 10, 7, 9, 6, 9, 8, 11, 12, 15, 17, 16, 15, 16, 12, 12, 16, 17, 20, 17, 19, 20, 15,
+        14, 12, 14, 12, 12, 11, 13, 14, 12, 22, 32, 31, 35, 26, 34, 30, 27, 27, 24, 26, 29, 32, 30, 27,
+        21, 18, 19, 13, 11, 11, 11, 14, 15, 29, 33, 32, 32, 29, 27, 26, 28, 26, 25, 29, 26, 24, 25, 20,
+        18, 18, 13, 13, 10, 12, 13, 11, 13, 22, 26, 27, 31, 24, 23, 26, 26, 24, 23, 25, 26, 24, 26, 24,
+        19, 20, 18, 13, 13, 9, 12, 12, 15, 16, 23, 24, 25, 24, 26, 22, 20, 20, 22, 26, 22, 21, 21, 21,
+        16, 18, 19, 14, 12, 13, 14, 14, 13, 20, 22, 26, 26, 21, 23, 23, 19, 19, 20, 24, 18, 19, 16, 17,
+        16, 16, 10, 9, 8, 7, 9, 8, 12, 13, 17, 14, 14, 14, 14, 11, 15, 13, 12, 17, 18, 17, 16, 15, 13
+      )
+
+      val strategy = new HoltWinters(
+        HoltWinters.MetricInterval.Hourly,
+        HoltWinters.SeriesSeasonality.Daily)
+
+      val nDaysTrain = 6
+      val nDaysTest = 1
+      val trainSize = nDaysTrain * 24
+      val testSize = nDaysTest * 24
+      val nTotal = trainSize + testSize
+
+      val anomalies = strategy.detect(
+        hourlyTrafficData.take(nTotal),
+        trainSize -> nTotal
+      )
+
+      anomalies should have size 2
+    }
+
+    "work on monthly data with yearly seasonality using custom seriesPeriodicity" in {
+      // https://datamarket.com/data/set/22ox/monthly-milk-production-pounds-per-cow-jan-62-dec-75
+      val monthlyMilkProduction = Vector[Double](
+        589, 561, 640, 656, 727, 697, 640, 599, 568, 577, 553, 582,
+        600, 566, 653, 673, 742, 716, 660, 617, 583, 587, 565, 598,
+        628, 618, 688, 705, 770, 736, 678, 639, 604, 611, 594, 634,
+        658, 622, 709, 722, 782, 756, 702, 653, 615, 621, 602, 635,
+        677, 635, 736, 755, 811, 798, 735, 697, 661, 667, 645, 688,
+        713, 667, 762, 784, 837, 817, 767, 722, 681, 687, 660, 698,
+        717, 696, 775, 796, 858, 826, 783, 740, 701, 706, 677, 711,
+        734, 690, 785, 805, 871, 845, 801, 764, 725, 723, 690, 734,
+        750, 707, 807, 824, 886, 859, 819, 783, 740, 747, 711, 751,
+        804, 756, 860, 878, 942, 913, 869, 834, 790, 800, 763, 800,
+        826, 799, 890, 900, 961, 935, 894, 855, 809, 810, 766, 805,
+        821, 773, 883, 898, 957, 924, 881, 837, 784, 791, 760, 802,
+        828, 778, 889, 902, 969, 947, 908, 867, 815, 812, 773, 813,
+        834, 782, 892, 903, 966, 937, 896, 858, 817, 827, 797, 843
+      )
+
+      val strategy = new HoltWinters(12)
+
+      val nYearsTrain = 3
+      val nYearsTest = 1
+      val trainSize = nYearsTrain * 12
+      val testSize = nYearsTest * 12
+      val nTotal = trainSize + testSize
+
+      val anomalies = strategy.detect(
+        monthlyMilkProduction.take(nTotal),
+        trainSize -> nTotal
+      )
+
+      anomalies should have size 7
+    }
+
+  }
+
+  "Additive Holt-Winters with Extended Results" should {
+
+    val twoWeeksOfData = setupData()
+
+    "fail if start after or equal to end" in {
+      val caught = intercept[IllegalArgumentException](
+        dailyMetricsWithWeeklySeasonalityAnomaliesWithExtendedResults(twoWeeksOfData, 1 -> 1))
+
+      caught.getMessage shouldBe "requirement failed: Start must be before end"
+    }
+
+    "fail if fewer than two cycles are available" in {
+      val fullInterval = 0 -> Int.MaxValue
+
+      val caught = intercept[IllegalArgumentException](
+        dailyMetricsWithWeeklySeasonalityAnomaliesWithExtendedResults(Vector.empty, fullInterval))
+
+      caught.getMessage shouldBe "requirement failed: Provided data series is empty"
+    }
+
+    "fail for negative search interval" in {
+      val negativeInterval = -2 -> -1
+
+      val caught = intercept[IllegalArgumentException](
+        dailyMetricsWithWeeklySeasonalityAnomaliesWithExtendedResults(twoWeeksOfData, negativeInterval))
+
+      caught.getMessage shouldBe
+        "requirement failed: The search interval needs to be strictly positive"
+    }
+
+    "fail for too few data" in {
+      val fullInterval = 0 -> Int.MaxValue
+      val shortSeries = Vector[Double](1, 2, 3)
+
+      val caught = intercept[IllegalArgumentException](
+        dailyMetricsWithWeeklySeasonalityAnomaliesWithExtendedResults(shortSeries, fullInterval))
+
+      caught.getMessage shouldBe
+        "requirement failed: Need at least two full cycles of data to estimate model"
+    }
+
+    "run anomaly detection on the last data point if search interval beyond series size" in {
+      val interval = 100 -> 110
+      val anomalies = dailyMetricsWithWeeklySeasonalityAnomaliesWithExtendedResults(twoWeeksOfData, interval)
+      anomalies shouldBe empty
+    }
+
+    "predict no anomaly for normally distributed errors" in {
+      val seriesWithOutlier = twoWeeksOfData ++ Vector(twoWeeksOfData.head)
+      val anomalies =
+        dailyMetricsWithWeeklySeasonalityAnomaliesWithExtendedResults(seriesWithOutlier, 14 -> 15)
+          .filter({case (_, anom) => anom.isAnomaly})
+      anomalies shouldBe empty
+    }
+
+    "predict an anomaly" in {
+      val seriesWithOutlier = twoWeeksOfData ++ Vector(0.0d)
+      val anomalies = dailyMetricsWithWeeklySeasonalityAnomaliesWithExtendedResults(
+        seriesWithOutlier, 14 -> Int.MaxValue).filter({case (_, anom) => anom.isAnomaly})
+
+      anomalies should have size 1
+      val (anomalyIndex, _) = anomalies.head
+      anomalyIndex shouldBe 14
+    }
+
+    "predict no anomalies on longer series" in {
+      val seriesWithOutlier = twoWeeksOfData ++ twoWeeksOfData
+      val anomalies = dailyMetricsWithWeeklySeasonalityAnomaliesWithExtendedResults(
+        seriesWithOutlier, 26 -> Int.MaxValue).filter({case (_, anom) => anom.isAnomaly})
+      anomalies shouldBe empty
+    }
+
+    "detect no anomalies on constant series" in {
+      val series = (0 until 21).map(_ => 1.0).toVector
+      val anomalies = dailyMetricsWithWeeklySeasonalityAnomaliesWithExtendedResults(series, 14 -> Int.MaxValue)
+        .filter({case (_, anom) => anom.isAnomaly})
+      anomalies shouldBe empty
+    }
+
+    "detect a single anomaly in constant series with a single error" in {
+      val series = ((0 until 20).map(_ => 1.0) ++ Seq(0.0)).toVector
+      val anomalies = dailyMetricsWithWeeklySeasonalityAnomaliesWithExtendedResults(series, 14 -> Int.MaxValue)
+        .filter({case (_, anom) => anom.isAnomaly})
+
+      anomalies should have size 1
+      val (detectionIndex, _) = anomalies.head
+      detectionIndex shouldBe 20
+    }
+
+    "detect no anomalies on exact linear trend series" in {
+      val series = (0 until 48).map(_.toDouble).toVector
+      val anomalies = dailyMetricsWithWeeklySeasonalityAnomaliesWithExtendedResults(series, 36 -> Int.MaxValue)
+        .filter({case (_, anom) => anom.isAnomaly})
+      anomalies shouldBe empty
+    }
+
+    "detect no anomalies on exact linear and seasonal effects" in {
+      val periodicity = 7
+      val series = (0 until 48).map(t => math.sin(2 * math.Pi / periodicity * t))
+        .zipWithIndex.map { case (s, level) => s + level }.toVector
+
+      val anomalies = dailyMetricsWithWeeklySeasonalityAnomaliesWithExtendedResults(series, 36 -> Int.MaxValue)
+        .filter({case (_, anom) => anom.isAnomaly})
+      anomalies shouldBe empty
+    }
+
+    "detect anomalies if the training data is wrong" in {
+      val train = Vector.fill(2)(Vector[Double](0, 1, 1, 1, 1, 1, 1)).flatten
+      val test = Vector[Double](1, 1, 1, 1, 1, 1, 1)
+      val series = train ++ test
+
+      val anomalies = dailyMetricsWithWeeklySeasonalityAnomaliesWithExtendedResults(series, 14 -> 21)
+        .filter({case (_, anom) => anom.isAnomaly})
+
+      anomalies should have size 1
+      val (detectionIndex, _) = anomalies.head
+      detectionIndex shouldBe 14
+    }
+
+    "work on monthly data with yearly seasonality" in {
+      // https://datamarket.com/data/set/22ox/monthly-milk-production-pounds-per-cow-jan-62-dec-75
+      val monthlyMilkProduction = Vector[Double](
+        589, 561, 640, 656, 727, 697, 640, 599, 568, 577, 553, 582,
+        600, 566, 653, 673, 742, 716, 660, 617, 583, 587, 565, 598,
+        628, 618, 688, 705, 770, 736, 678, 639, 604, 611, 594, 634,
+        658, 622, 709, 722, 782, 756, 702, 653, 615, 621, 602, 635,
+        677, 635, 736, 755, 811, 798, 735, 697, 661, 667, 645, 688,
+        713, 667, 762, 784, 837, 817, 767, 722, 681, 687, 660, 698,
+        717, 696, 775, 796, 858, 826, 783, 740, 701, 706, 677, 711,
+        734, 690, 785, 805, 871, 845, 801, 764, 725, 723, 690, 734,
+        750, 707, 807, 824, 886, 859, 819, 783, 740, 747, 711, 751,
+        804, 756, 860, 878, 942, 913, 869, 834, 790, 800, 763, 800,
+        826, 799, 890, 900, 961, 935, 894, 855, 809, 810, 766, 805,
+        821, 773, 883, 898, 957, 924, 881, 837, 784, 791, 760, 802,
+        828, 778, 889, 902, 969, 947, 908, 867, 815, 812, 773, 813,
+        834, 782, 892, 903, 966, 937, 896, 858, 817, 827, 797, 843
+      )
+
+      val strategy = new HoltWinters(
+        HoltWinters.MetricInterval.Monthly,
+        HoltWinters.SeriesSeasonality.Yearly)
+
+      val nYearsTrain = 3
+      val nYearsTest = 1
+      val trainSize = nYearsTrain * 12
+      val testSize = nYearsTest * 12
+      val nTotal = trainSize + testSize
+
+      val anomalies = strategy.detectWithExtendedResults(
+        monthlyMilkProduction.take(nTotal),
+        trainSize -> nTotal
+      ).filter({case (_, anom) => anom.isAnomaly})
+
+      anomalies should have size 7
+    }
+
+    "work on an additional series with yearly seasonality" in {
+      // https://datamarket.com/data/set/22n4/monthly-car-sales-in-quebec-1960-1968
+      val monthlyCarSalesQuebec = Vector[Double](
+        6550, 8728, 12026, 14395, 14587, 13791, 9498, 8251, 7049, 9545, 9364, 8456,
+        7237, 9374, 11837, 13784, 15926, 13821, 11143, 7975, 7610, 10015, 12759, 8816,
+        10677, 10947, 15200, 17010, 20900, 16205, 12143, 8997, 5568, 11474, 12256, 10583,
+        10862, 10965, 14405, 20379, 20128, 17816, 12268, 8642, 7962, 13932, 15936, 12628,
+        12267, 12470, 18944, 21259, 22015, 18581, 15175, 10306, 10792, 14752, 13754, 11738,
+        12181, 12965, 19990, 23125, 23541, 21247, 15189, 14767, 10895, 17130, 17697, 16611,
+        12674, 12760, 20249, 22135, 20677, 19933, 15388, 15113, 13401, 16135, 17562, 14720,
+        12225, 11608, 20985, 19692, 24081, 22114, 14220, 13434, 13598, 17187, 16119, 13713,
+        13210, 14251, 20139, 21725, 26099, 21084, 18024, 16722, 14385, 21342, 17180, 14577
+      )
+
+      val strategy = new HoltWinters(
+        HoltWinters.MetricInterval.Monthly,
+        HoltWinters.SeriesSeasonality.Yearly)
+
+      val nYearsTrain = 3
+      val nYearsTest = 1
+      val trainSize = nYearsTrain * 12
+      val testSize = nYearsTest * 12
+      val nTotal = trainSize + testSize
+
+      val anomalies = strategy.detectWithExtendedResults(
+        monthlyCarSalesQuebec.take(nTotal),
+        trainSize -> nTotal
+      ).filter({case (_, anom) => anom.isAnomaly})
+
+      anomalies should have size 3
+    }
+  }
+
+  private def setupData(): Vector[Double] = {
+    val rng = new util.Random(seed = 42L)
+    val twoWeeksOfData = Vector.fill(2)(
+      Vector[Double](1, 1, 1.2, 1.3, 1.5, 2.1, 1.9)
+    ).flatten.map(_ + rng.nextGaussian())
+    twoWeeksOfData
+  }
+
 }
 
 object HoltWintersTest {
@@ -288,4 +489,16 @@ object HoltWintersTest {
     strategy.detect(series, interval)
   }
 
+  def dailyMetricsWithWeeklySeasonalityAnomaliesWithExtendedResults(
+      series: Vector[Double],
+      interval: (Int, Int)): Seq[(Int, AnomalyDetectionDataPoint)] = {
+
+    val strategy = new HoltWinters(
+      HoltWinters.MetricInterval.Daily,
+      HoltWinters.SeriesSeasonality.Weekly
+    )
+
+    strategy.detectWithExtendedResults(series, interval)
+  }
+
 }
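One convention worth spelling out before the next file: both detection APIs in these suites take the search interval as a half-open index range [start, end), which is why the seasonal tests reserve whole training cycles and scan only the tail of the series. A sketch of that bookkeeping with the HoltWinters strategy — the synthetic series is illustrative, not taken from the patch:

```scala
import com.amazon.deequ.anomalydetection.seasonal.HoltWinters

// Mirrors the train/test arithmetic used in the HoltWinters tests above:
// train on six full daily cycles of hourly data, detect only on the final day.
val strategy = new HoltWinters(
  HoltWinters.MetricInterval.Hourly,
  HoltWinters.SeriesSeasonality.Daily)

// A perfectly seasonal sawtooth: one week of hourly values
val hourlySeries: Vector[Double] = Vector.tabulate(7 * 24)(h => 10.0 + (h % 24))
val trainSize = 6 * 24
val nTotal = trainSize + 24

// Half-open interval: only indices trainSize until nTotal are checked for anomalies
val flagged = strategy
  .detectWithExtendedResults(hourlySeries.take(nTotal), trainSize -> nTotal)
  .filter { case (_, point) => point.isAnomaly }
```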
diff --git a/src/test/scala/com/amazon/deequ/checks/ApplicabilityTest.scala b/src/test/scala/com/amazon/deequ/checks/ApplicabilityTest.scala
index 542f40fcf..73e589886 100644
--- a/src/test/scala/com/amazon/deequ/checks/ApplicabilityTest.scala
+++ b/src/test/scala/com/amazon/deequ/checks/ApplicabilityTest.scala
@@ -18,11 +18,14 @@ package com.amazon.deequ
 package checks
 
 import com.amazon.deequ.analyzers.applicability.Applicability
-import com.amazon.deequ.analyzers.{Completeness, Compliance, Maximum, Minimum}
+import com.amazon.deequ.analyzers.{Completeness, Compliance, Maximum, Minimum, Size}
+import com.amazon.deequ.anomalydetection.{AnomalyDetectionStrategy, AnomalyDetectionStrategyWithExtendedResults}
+import com.amazon.deequ.repository.MetricsRepository
 import org.apache.spark.sql.types._
+import org.scalamock.scalatest.MockFactory
 import org.scalatest.wordspec.AnyWordSpec
 
-class ApplicabilityTest extends AnyWordSpec with SparkContextSpec {
+class ApplicabilityTest extends AnyWordSpec with SparkContextSpec with MockFactory {
 
   private[this] val schema = StructType(Array(
     StructField("stringCol", StringType, nullable = true),
@@ -48,7 +51,7 @@ class ApplicabilityTest extends AnyWordSpec with SparkContextSpec {
 
   "Applicability tests for checks" should {
 
-    "recognize applicable checks as applicable" in withSparkSession { session =>
+    "recognize applicable analysis based checks as applicable" in withSparkSession { session =>
 
       val applicability = new Applicability(session)
 
@@ -66,6 +69,25 @@ class ApplicabilityTest extends AnyWordSpec with SparkContextSpec {
       }
     }
 
+    "recognize applicable anomaly based checks with extended results as applicable" in withSparkSession { session =>
+
+      val applicability = new Applicability(session)
+      val fakeAnomalyDetector = mock[AnomalyDetectionStrategyWithExtendedResults]
+      val repository = mock[MetricsRepository]
+      val validCheck = Check(CheckLevel.Error, "anomaly test")
+        .isNewestPointNonAnomalousWithExtendedResults(repository, fakeAnomalyDetector, Size(), Map.empty,
+          None, None)
+
+      val resultForValidCheck = applicability.isApplicable(validCheck, schema)
+
+      assert(resultForValidCheck.isApplicable)
+      assert(resultForValidCheck.failures.isEmpty)
+      assert(resultForValidCheck.constraintApplicabilities.size == validCheck.constraints.size)
+      resultForValidCheck.constraintApplicabilities.foreach { case (_, applicable) =>
+        assert(applicable)
+      }
+    }
+
     "detect checks with non existing columns" in withSparkSession { session =>
 
       val applicability = new Applicability(session)
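The CheckTest changes below drive this entry point with mocked strategies and repositories; in application code the same method is called with real ones. A condensed sketch, assuming a repository already populated by earlier runs and assuming (as the strategy tests above exercise) that RelativeRateOfChangeStrategy implements the extended-results trait — the tag filter and strategy parameters are placeholders:

```scala
import com.amazon.deequ.analyzers.Size
import com.amazon.deequ.anomalydetection.RelativeRateOfChangeStrategy
import com.amazon.deequ.checks.{Check, CheckLevel}
import com.amazon.deequ.repository.memory.InMemoryMetricsRepository

val repository = new InMemoryMetricsRepository() // placeholder: normally pre-filled with past runs

// Error-level check: the newest Size metric must not be anomalous relative to
// the tagged history; the evaluated data point is kept for reporting.
val sizeCheck: Check = Check(CheckLevel.Error, "size anomaly check")
  .isNewestPointNonAnomalousWithExtendedResults(
    repository,
    RelativeRateOfChangeStrategy(Some(0.5), Some(2.0)),
    Size(),
    Map("marketplace" -> "EU"), // withTagValues: restrict the historic metrics considered
    None,                       // afterDate
    None)                       // beforeDate
```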
diff --git a/src/test/scala/com/amazon/deequ/checks/CheckTest.scala b/src/test/scala/com/amazon/deequ/checks/CheckTest.scala
index eab056f31..31c5209d7 100644
--- a/src/test/scala/com/amazon/deequ/checks/CheckTest.scala
+++ b/src/test/scala/com/amazon/deequ/checks/CheckTest.scala
@@ -21,7 +21,14 @@ import com.amazon.deequ.analyzers._
 import com.amazon.deequ.analyzers.runners.AnalysisRunner
 import com.amazon.deequ.analyzers.runners.AnalyzerContext
 import com.amazon.deequ.anomalydetection.Anomaly
+import com.amazon.deequ.anomalydetection.AnomalyDetectionAssertionResult
+import com.amazon.deequ.anomalydetection.AnomalyDetectionDataPoint
 import com.amazon.deequ.anomalydetection.AnomalyDetectionStrategy
+import com.amazon.deequ.anomalydetection.AnomalyDetectionStrategyWithExtendedResults
+import com.amazon.deequ.anomalydetection.Bound
+import com.amazon.deequ.anomalydetection.BoundedRange
+import com.amazon.deequ.anomalydetection.ExtendedDetectionResult
+import com.amazon.deequ.checks.Check.getNewestPointAnomalyResults
 import com.amazon.deequ.constraints.ConstrainableDataTypes
 import com.amazon.deequ.constraints.ConstraintStatus
 import com.amazon.deequ.metrics.DoubleMetric
@@ -48,6 +55,10 @@ class CheckTest extends AnyWordSpec with Matchers with SparkContextSpec with Fix
 
   import CheckTest._
 
+  // This is used as a default bounded range value for anomaly detection tests.
+  private[this] val defaultBoundedRange = BoundedRange(lowerBound = Bound(0.0, inclusive = true),
+    upperBound = Bound(1.0, inclusive = true))
+
   "Check" should {
 
     "return the correct check status for completeness" in withSparkSession { sparkSession =>
@@ -1160,6 +1171,234 @@ class CheckTest extends AnyWordSpec with Matchers with SparkContextSpec with Fix
     }
   }
 
+  "Check isNewestPointNonAnomalousWithExtendedResults" should {
+
+    "return the correct check status for anomaly detection for different analyzers" in
+      withSparkSession { sparkSession =>
+        evaluateWithRepository { repository =>
+          // Fake Anomaly Detector
+          val fakeAnomalyDetector = mock[AnomalyDetectionStrategyWithExtendedResults]
+          inSequence {
+            // Size results
+            (fakeAnomalyDetector.detectWithExtendedResults _)
+              .expects(Vector(1.0, 2.0, 3.0, 4.0, 11.0), (4, 5))
+              .returns(Seq(
+                (0, AnomalyDetectionDataPoint(1.0, 1.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (1, AnomalyDetectionDataPoint(2.0, 2.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (2, AnomalyDetectionDataPoint(3.0, 3.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (3, AnomalyDetectionDataPoint(4.0, 4.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (4, AnomalyDetectionDataPoint(11.0, 11.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false))))
+              .once()
+            (fakeAnomalyDetector.detectWithExtendedResults _).expects(Vector(1.0, 2.0, 3.0, 4.0, 4.0), (4, 5))
+              .returns(Seq(
+                (0, AnomalyDetectionDataPoint(1.0, 1.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (1, AnomalyDetectionDataPoint(2.0, 2.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (2, AnomalyDetectionDataPoint(3.0, 3.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (3, AnomalyDetectionDataPoint(4.0, 4.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (4, AnomalyDetectionDataPoint(4.0, 4.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true))))
+              .once()
+            // Distinctness results
+            (fakeAnomalyDetector.detectWithExtendedResults _)
+              .expects(Vector(1.0, 2.0, 3.0, 4.0, 1), (4, 5))
+              .returns(Seq(
+                (0, AnomalyDetectionDataPoint(1.0, 1.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (1, AnomalyDetectionDataPoint(2.0, 2.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (2, AnomalyDetectionDataPoint(3.0, 3.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (3, AnomalyDetectionDataPoint(4.0, 4.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (4, AnomalyDetectionDataPoint(1.0, 1.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false))))
+              .once()
+            (fakeAnomalyDetector.detectWithExtendedResults _)
+              .expects(Vector(1.0, 2.0, 3.0, 4.0, 1), (4, 5))
+              .returns(Seq(
+                (0, AnomalyDetectionDataPoint(1.0, 1.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (1, AnomalyDetectionDataPoint(2.0, 2.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (2, AnomalyDetectionDataPoint(3.0, 3.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (3, AnomalyDetectionDataPoint(4.0, 4.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (4, AnomalyDetectionDataPoint(1.0, 1.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true))))
+              .once()
+          }
+
+          // Get test AnalyzerContexts
+          val analysis = Analysis().addAnalyzers(Seq(Size(), Distinctness(Seq("c0", "c1"))))
+
+          val context11Rows = AnalysisRunner.run(getDfWithNRows(sparkSession, 11), analysis)
+          val context4Rows = AnalysisRunner.run(getDfWithNRows(sparkSession, 4), analysis)
+          val contextNoRows = AnalysisRunner.run(getDfEmpty(sparkSession), analysis)
+
+          // Check isNewestPointNonAnomalousWithExtendedResults using Size
+          val sizeAnomalyCheck = Check(CheckLevel.Error, "anomaly test")
+            .isNewestPointNonAnomalousWithExtendedResults(repository, fakeAnomalyDetector, Size(), Map.empty,
+              None, None)
+
+          assert(sizeAnomalyCheck.evaluate(context11Rows).status == CheckStatus.Success)
+          assert(sizeAnomalyCheck.evaluate(context4Rows).status == CheckStatus.Error)
+          assert(sizeAnomalyCheck.evaluate(contextNoRows).status == CheckStatus.Error)
+
+          // Now with Distinctness
+          val distinctnessAnomalyCheck = Check(CheckLevel.Error, "anomaly test")
+            .isNewestPointNonAnomalousWithExtendedResults(repository, fakeAnomalyDetector,
+              Distinctness(Seq("c0", "c1")), Map.empty, None, None)
+
+          assert(distinctnessAnomalyCheck.evaluate(context11Rows).status == CheckStatus.Success)
+          assert(distinctnessAnomalyCheck.evaluate(context4Rows).status == CheckStatus.Error)
+          assert(distinctnessAnomalyCheck.evaluate(contextNoRows).status == CheckStatus.Error)
+        }
+      }
+
+    "only use historic results filtered by tagValues if specified" in
+      withSparkSession { sparkSession =>
+        evaluateWithRepository { repository =>
+          // Fake Anomaly Detector
+          val fakeAnomalyDetector = mock[AnomalyDetectionStrategyWithExtendedResults]
+          inSequence {
+            // Size results
+            (fakeAnomalyDetector.detectWithExtendedResults _)
+              .expects(Vector(1.0, 2.0, 11.0), (2, 3))
+              .returns(Seq(
+                (0, AnomalyDetectionDataPoint(1.0, 1.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (1, AnomalyDetectionDataPoint(2.0, 2.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (2, AnomalyDetectionDataPoint(11.0, 11.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false))))
+              .once()
+            (fakeAnomalyDetector.detectWithExtendedResults _).expects(Vector(1.0, 2.0, 4.0), (2, 3))
+              .returns(Seq(
+                (0, AnomalyDetectionDataPoint(1.0, 1.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (1, AnomalyDetectionDataPoint(2.0, 2.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (2, AnomalyDetectionDataPoint(4.0, 4.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true))))
+              .once()
+          }
+
+          // Get test AnalyzerContexts
+          val analysis = Analysis().addAnalyzer(Size())
+
+          val context11Rows = AnalysisRunner.run(getDfWithNRows(sparkSession, 11), analysis)
+          val context4Rows = AnalysisRunner.run(getDfWithNRows(sparkSession, 4), analysis)
+          val contextNoRows = AnalysisRunner.run(getDfEmpty(sparkSession), analysis)
+
+          // Check isNewestPointNonAnomalousWithExtendedResults using Size
+          val sizeAnomalyCheck = Check(CheckLevel.Error, "anomaly test")
+            .isNewestPointNonAnomalousWithExtendedResults(repository, fakeAnomalyDetector, Size(),
+              Map("Region" -> "EU"), None, None)
+
+          assert(sizeAnomalyCheck.evaluate(context11Rows).status == CheckStatus.Success)
+          assert(sizeAnomalyCheck.evaluate(context4Rows).status == CheckStatus.Error)
+          assert(sizeAnomalyCheck.evaluate(contextNoRows).status == CheckStatus.Error)
+        }
+      }
+
+    "only use historic results after some dateTime if specified" in
+      withSparkSession { sparkSession =>
+        evaluateWithRepository { repository =>
+          // Fake Anomaly Detector
+          val fakeAnomalyDetector = mock[AnomalyDetectionStrategyWithExtendedResults]
+          inSequence {
+            // Size results
+            (fakeAnomalyDetector.detectWithExtendedResults _)
+              .expects(Vector(3.0, 4.0, 11.0), (2, 3))
+              .returns(Seq(
+                (0, AnomalyDetectionDataPoint(3.0, 3.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (1, AnomalyDetectionDataPoint(4.0, 4.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (2, AnomalyDetectionDataPoint(11.0, 11.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false))))
+              .once()
+            (fakeAnomalyDetector.detectWithExtendedResults _).expects(Vector(3.0, 4.0, 4.0), (2, 3))
+              .returns(Seq(
+                (0, AnomalyDetectionDataPoint(3.0, 3.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (1, AnomalyDetectionDataPoint(4.0, 4.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (2, AnomalyDetectionDataPoint(4.0, 4.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true))))
+              .once()
+          }
+
+          // Get test AnalyzerContexts
+          val analysis = Analysis().addAnalyzer(Size())
+
+          val context11Rows = AnalysisRunner.run(getDfWithNRows(sparkSession, 11), analysis)
+          val context4Rows = AnalysisRunner.run(getDfWithNRows(sparkSession, 4), analysis)
+          val contextNoRows = AnalysisRunner.run(getDfEmpty(sparkSession), analysis)
+
+          // Check isNewestPointNonAnomalousWithExtendedResults using Size
+          val sizeAnomalyCheck = Check(CheckLevel.Error, "anomaly test")
+            .isNewestPointNonAnomalousWithExtendedResults(repository, fakeAnomalyDetector, Size(),
+              Map.empty, Some(3), None)
+
+          assert(sizeAnomalyCheck.evaluate(context11Rows).status == CheckStatus.Success)
+          assert(sizeAnomalyCheck.evaluate(context4Rows).status == CheckStatus.Error)
+          assert(sizeAnomalyCheck.evaluate(contextNoRows).status == CheckStatus.Error)
+        }
+      }
+
+    "only use historic results before some dateTime if specified" in
+      withSparkSession { sparkSession =>
+        evaluateWithRepository { repository =>
+          // Fake Anomaly Detector
+          val fakeAnomalyDetector = mock[AnomalyDetectionStrategyWithExtendedResults]
+          inSequence {
+            // Size results
+            (fakeAnomalyDetector.detectWithExtendedResults _)
+              .expects(Vector(1.0, 2.0, 11.0), (2, 3))
+              .returns(Seq(
+                (0, AnomalyDetectionDataPoint(1.0, 1.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (1, AnomalyDetectionDataPoint(2.0, 2.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (2, AnomalyDetectionDataPoint(11.0, 11.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false))))
+              .once()
+            (fakeAnomalyDetector.detectWithExtendedResults _).expects(Vector(1.0, 2.0, 4.0), (2, 3))
+              .returns(Seq(
+                (0, AnomalyDetectionDataPoint(1.0, 1.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (1, AnomalyDetectionDataPoint(2.0, 2.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+                (2, AnomalyDetectionDataPoint(4.0, 4.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true))))
+              .once()
+          }
+
+          // Get test AnalyzerContexts
+          val analysis = Analysis().addAnalyzer(Size())
+
+          val context11Rows = AnalysisRunner.run(getDfWithNRows(sparkSession, 11), analysis)
+          val context4Rows = AnalysisRunner.run(getDfWithNRows(sparkSession, 4), analysis)
+          val contextNoRows = AnalysisRunner.run(getDfEmpty(sparkSession), analysis)
+
+          // Check isNewestPointNonAnomalousWithExtendedResults using Size
+          val sizeAnomalyCheck = Check(CheckLevel.Error, "anomaly test")
+            .isNewestPointNonAnomalousWithExtendedResults(repository, fakeAnomalyDetector, Size(),
+              Map.empty, None, Some(2))
+
+          assert(sizeAnomalyCheck.evaluate(context11Rows).status == CheckStatus.Success)
+          assert(sizeAnomalyCheck.evaluate(context4Rows).status == CheckStatus.Error)
+          assert(sizeAnomalyCheck.evaluate(contextNoRows).status == CheckStatus.Error)
+        }
+      }
+  }
+
+  "getNewestPointAnomalyResults returns correct assertion result from anomaly detection data point sequence " +
+    "with multiple data points" in {
+    val anomalySequence: Seq[(Long, AnomalyDetectionDataPoint)] =
+      Seq(
+        (0, AnomalyDetectionDataPoint(1.0, 1.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+        (1, AnomalyDetectionDataPoint(2.0, 2.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)),
+        (2, AnomalyDetectionDataPoint(11.0, 11.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true)))
+    val result: AnomalyDetectionAssertionResult =
+      getNewestPointAnomalyResults(ExtendedDetectionResult(anomalySequence))
+    assert(result.hasAnomaly)
+    assert(result.anomalyDetectionExtendedResult.anomalyDetectionDataPoint ==
+      AnomalyDetectionDataPoint(11.0, 11.0, defaultBoundedRange, confidence = 1.0, isAnomaly = true))
+  }
+
+  "getNewestPointAnomalyResults returns correct assertion result from anomaly detection data point sequence " +
+    "with one data point" in {
+    val anomalySequence: Seq[(Long, AnomalyDetectionDataPoint)] =
+      Seq(
+        (0, AnomalyDetectionDataPoint(11.0, 11.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false)))
+    val result: AnomalyDetectionAssertionResult =
+      getNewestPointAnomalyResults(ExtendedDetectionResult(anomalySequence))
+    assert(!result.hasAnomaly)
+    assert(result.anomalyDetectionExtendedResult.anomalyDetectionDataPoint ==
+      AnomalyDetectionDataPoint(11.0, 11.0, defaultBoundedRange, confidence = 1.0, isAnomaly = false))
+  }
+
+  "assert getNewestPointAnomalyResults throws exception from empty anomaly detection sequence" in {
+    val anomalySequence: Seq[(Long, AnomalyDetectionDataPoint)] = Seq()
+    intercept[IllegalArgumentException] {
+      getNewestPointAnomalyResults(ExtendedDetectionResult(anomalySequence))
+    }
+  }
+
   /**
     * Test for DataSync in verification suite.
     */
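The three tests just above pin down the contract of `getNewestPointAnomalyResults`: it selects the data point with the largest (newest) key, wraps it in an `AnomalyDetectionAssertionResult`, and throws `IllegalArgumentException` on an empty sequence. A sketch of a caller relying on exactly that contract (the message formatting is illustrative):

```scala
import com.amazon.deequ.anomalydetection.{AnomalyDetectionAssertionResult,
  AnomalyDetectionDataPoint, ExtendedDetectionResult}
import com.amazon.deequ.checks.Check.getNewestPointAnomalyResults

def describeNewestPoint(result: ExtendedDetectionResult): String = {
  // Throws IllegalArgumentException on an empty sequence, as asserted above
  val assertion: AnomalyDetectionAssertionResult = getNewestPointAnomalyResults(result)
  val point: AnomalyDetectionDataPoint =
    assertion.anomalyDetectionExtendedResult.anomalyDetectionDataPoint
  if (assertion.hasAnomaly) {
    s"newest metric ${point.anomalyMetricValue} is anomalous (confidence ${point.confidence})"
  } else {
    s"newest metric ${point.anomalyMetricValue} looks normal"
  }
}
```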
diff --git a/src/test/scala/com/amazon/deequ/constraints/AnomalyExtendedResultsConstraintTest.scala b/src/test/scala/com/amazon/deequ/constraints/AnomalyExtendedResultsConstraintTest.scala
new file mode 100644
index 000000000..606b1966b
--- /dev/null
+++ b/src/test/scala/com/amazon/deequ/constraints/AnomalyExtendedResultsConstraintTest.scala
@@ -0,0 +1,315 @@
+/**
+ * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"). You may not
+ * use this file except in compliance with the License. A copy of the License
+ * is located at
+ *
+ *     http://aws.amazon.com/apache2.0/
+ *
+ * or in the "license" file accompanying this file. This file is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+package com.amazon.deequ.constraints
+
+import com.amazon.deequ.SparkContextSpec
+import com.amazon.deequ.analyzers._
+import com.amazon.deequ.analyzers.runners.MetricCalculationException
+import com.amazon.deequ.anomalydetection.AnomalyDetectionAssertionResult
+import com.amazon.deequ.anomalydetection.AnomalyDetectionDataPoint
+import com.amazon.deequ.anomalydetection.AnomalyDetectionExtendedResult
+import com.amazon.deequ.anomalydetection.Bound
+import com.amazon.deequ.anomalydetection.BoundedRange
+import com.amazon.deequ.constraints.ConstraintUtils.calculate
+import com.amazon.deequ.metrics.{DoubleMetric, Entity, Metric}
+import com.amazon.deequ.utils.FixtureSupport
+import org.apache.spark.sql.DataFrame
+import org.scalamock.scalatest.MockFactory
+import org.scalatest.{Matchers, PrivateMethodTester, WordSpec}
+
+import scala.util.{Failure, Try}
+
+class AnomalyExtendedResultsConstraintTest extends WordSpec with Matchers with SparkContextSpec
+  with FixtureSupport with MockFactory with PrivateMethodTester {
+
+  /**
+   * Sample function to use as value picker
+   *
+   * @return Returns input multiplied by 2
+   */
+  def valueDoubler(value: Double): Double = {
+    value * 2
+  }
+
+  /**
+   * Sample analyzer that returns a 1.0 value if the given column exists and fails otherwise.
+   */
+  case class SampleAnalyzer(column: String) extends Analyzer[NumMatches, DoubleMetric] {
+    override def toFailureMetric(exception: Exception): DoubleMetric = {
+      DoubleMetric(Entity.Column, "sample", column, Failure(MetricCalculationException
+        .wrapIfNecessary(exception)))
+    }
+
+    override def calculate(
+        data: DataFrame,
+        stateLoader: Option[StateLoader],
+        statePersister: Option[StatePersister],
+        filterCondition: Option[String])
+      : DoubleMetric = {
+      val value: Try[Double] = Try {
+        require(data.columns.contains(column), s"Missing column $column")
+        1.0
+      }
+      DoubleMetric(Entity.Column, "sample", column, value)
+    }
+
+    override def computeStateFrom(data: DataFrame, filterCondition: Option[String] = None)
+      : Option[NumMatches] = {
+      throw new NotImplementedError()
+    }
+
+    override def computeMetricFrom(state: Option[NumMatches]): DoubleMetric = {
+      throw new NotImplementedError()
+    }
+  }
+
+  "Anomaly extended results constraint" should {
+
+    val defaultBoundedRange = BoundedRange(lowerBound = Bound(0.0, inclusive = true),
+      upperBound = Bound(1.0, inclusive = true))
+
+    "assert correctly on values if analysis is successful" in
+      withSparkSession { sparkSession =>
+        val df = getDfMissing(sparkSession)
+
+        // Analysis result should equal 1.0 for an existing column
+        val anomalyAssertionFunctionA = (_: Double) => {
+          AnomalyDetectionAssertionResult(hasAnomaly = false,
+            AnomalyDetectionExtendedResult(AnomalyDetectionDataPoint(1.0, 1.0, confidence = 1.0,
+              anomalyCheckRange = defaultBoundedRange, isAnomaly = false))
+          )
+        }
+
+        val resultA = calculate(
+          AnomalyExtendedResultsConstraint[NumMatches, Double, Double](
+            SampleAnalyzer("att1"), anomalyAssertionFunctionA), df)
+
+        assert(resultA.status == ConstraintStatus.Success)
+        assert(resultA.message.isEmpty)
+        assert(resultA.metric.isDefined)
+
+        val anomalyAssertionFunctionB = (_: Double) => {
+          AnomalyDetectionAssertionResult(hasAnomaly = true,
+            AnomalyDetectionExtendedResult(AnomalyDetectionDataPoint(1.0, 1.0, confidence = 1.0,
+              anomalyCheckRange = defaultBoundedRange, isAnomaly = true)))
+        }
+
+        // Analysis result should equal 1.0 for an existing column
+        val resultB = calculate(AnomalyExtendedResultsConstraint[NumMatches, Double, Double](
+          SampleAnalyzer("att1"), anomalyAssertionFunctionB), df)
+
+        assert(resultB.status == ConstraintStatus.Failure)
+        assert(resultB.message.contains(
+          "Value: 1.0 does not meet the constraint requirement, check the anomaly detection metadata!"))
+        assert(resultB.metric.isDefined)
+
+        val anomalyAssertionFunctionC = anomalyAssertionFunctionA
+
+        // Analysis should fail for a non-existing column
+        val resultC = calculate(AnomalyExtendedResultsConstraint[NumMatches, Double, Double](
+          SampleAnalyzer("someMissingColumn"), anomalyAssertionFunctionC), df)
+
+        assert(resultC.status == ConstraintStatus.Failure)
+        assert(resultC.message.contains("requirement failed: Missing column someMissingColumn"))
+        assert(resultC.metric.isDefined)
+      }
+
+    "execute value picker on the analysis result value, if provided" in
+      withSparkSession { sparkSession =>
+
+        val df = getDfMissing(sparkSession)
+
+        val anomalyAssertionFunctionA = (_: Double) => {
+          AnomalyDetectionAssertionResult(hasAnomaly = false,
+            AnomalyDetectionExtendedResult(AnomalyDetectionDataPoint(2.0, 2.0, confidence = 1.0,
+              anomalyCheckRange = defaultBoundedRange, isAnomaly = false))
+          )
+        }
+
+        // After the value picker doubles it, the analysis result should equal 2.0 for an existing column
+        assert(calculate(AnomalyExtendedResultsConstraint[NumMatches, Double, Double](
+          SampleAnalyzer("att1"), anomalyAssertionFunctionA, Some(valueDoubler)), df).status ==
+          ConstraintStatus.Success)
+
+        val anomalyAssertionFunctionB = (_: Double) => {
+          AnomalyDetectionAssertionResult(hasAnomaly = true,
+            AnomalyDetectionExtendedResult(AnomalyDetectionDataPoint(2.0, 2.0, confidence = 1.0,
+              anomalyCheckRange = defaultBoundedRange, isAnomaly = true)))
+        }
+
+        assert(calculate(AnomalyExtendedResultsConstraint[NumMatches, Double, Double](
+          SampleAnalyzer("att1"), anomalyAssertionFunctionB, Some(valueDoubler)), df).status ==
+          ConstraintStatus.Failure)
+
+        val anomalyAssertionFunctionC = anomalyAssertionFunctionA
+
+        // Analysis should fail for a non-existing column
+        assert(calculate(AnomalyExtendedResultsConstraint[NumMatches, Double, Double](
+          SampleAnalyzer("someMissingColumn"), anomalyAssertionFunctionC, Some(valueDoubler)), df).status ==
+          ConstraintStatus.Failure)
+      }
+
+    "get the analysis from the context, if provided" in withSparkSession { sparkSession =>
+      val df = getDfMissing(sparkSession)
+
+      val emptyResults = Map.empty[Analyzer[_, Metric[_]], Metric[_]]
+
+      val validResults = Map[Analyzer[_, Metric[_]], Metric[_]](
+        SampleAnalyzer("att1") -> SampleAnalyzer("att1").calculate(df),
+        SampleAnalyzer("someMissingColumn") -> SampleAnalyzer("someMissingColumn").calculate(df)
+      )
+
+      val anomalyAssertionFunctionA = (_: Double) => {
+        AnomalyDetectionAssertionResult(hasAnomaly = false,
+          AnomalyDetectionExtendedResult(AnomalyDetectionDataPoint(1.0, 1.0, confidence = 1.0,
+            anomalyCheckRange = defaultBoundedRange, isAnomaly = false)))
+      }
+      val anomalyAssertionFunctionB = (_: Double) => {
+        AnomalyDetectionAssertionResult(hasAnomaly = true,
+          AnomalyDetectionExtendedResult(AnomalyDetectionDataPoint(1.0, 1.0, confidence = 1.0,
+            anomalyCheckRange = defaultBoundedRange, isAnomaly = true)))
+      }
+
+      // Analysis result should equal 1.0 for an existing column
+      assert(AnomalyExtendedResultsConstraint[NumMatches, Double, Double]
+        (SampleAnalyzer("att1"), anomalyAssertionFunctionA)
+        .evaluate(validResults).status == ConstraintStatus.Success)
+      assert(AnomalyExtendedResultsConstraint[NumMatches, Double, Double]
+        (SampleAnalyzer("att1"), anomalyAssertionFunctionB)
+        .evaluate(validResults).status == ConstraintStatus.Failure)
+      assert(AnomalyExtendedResultsConstraint[NumMatches, Double, Double]
+        (SampleAnalyzer("someMissingColumn"), anomalyAssertionFunctionA)
+        .evaluate(validResults).status == ConstraintStatus.Failure)
+
+      // Although the assertion would pass, the analysis result is missing,
+      // so the constraint fails with a missing-analysis message
+      AnomalyExtendedResultsConstraint[NumMatches, Double, Double](SampleAnalyzer("att1"), anomalyAssertionFunctionA)
+        .evaluate(emptyResults) match {
+        case result =>
+          assert(result.status == ConstraintStatus.Failure)
+          assert(result.message.contains("Missing Analysis, can't run the constraint!"))
+          assert(result.metric.isEmpty)
+      }
+    }
+
+    "execute value picker on the analysis result value retrieved from context, if provided" in
+      withSparkSession { sparkSession =>
+        val df = getDfMissing(sparkSession)
+        val validResults = Map[Analyzer[_, Metric[_]], Metric[_]](
+          SampleAnalyzer("att1") -> SampleAnalyzer("att1").calculate(df))
+
+        val anomalyAssertionFunction = (_: Double) => {
+          AnomalyDetectionAssertionResult(hasAnomaly = false,
+            AnomalyDetectionExtendedResult(AnomalyDetectionDataPoint(2.0, 2.0, confidence = 1.0,
+              anomalyCheckRange = defaultBoundedRange, isAnomaly = false)))
+        }
+
+        assert(AnomalyExtendedResultsConstraint[NumMatches, Double, Double](
+          SampleAnalyzer("att1"), anomalyAssertionFunction, Some(valueDoubler))
+          .evaluate(validResults).status == ConstraintStatus.Success)
+      }
+
+    "fail on analysis if value picker is provided but fails" in withSparkSession { sparkSession =>
+      def problematicValuePicker(value: Double): Double = {
+        throw new RuntimeException("Something wrong with this picker")
+      }
+
+      val df = getDfMissing(sparkSession)
+
+      val emptyResults = Map.empty[Analyzer[_, Metric[_]], Metric[_]]
+      val validResults = Map[Analyzer[_, Metric[_]], Metric[_]](
+        SampleAnalyzer("att1") -> SampleAnalyzer("att1").calculate(df))
+
+      val anomalyAssertionFunction = (_: Double) => {
+        AnomalyDetectionAssertionResult(hasAnomaly = false,
+          AnomalyDetectionExtendedResult(AnomalyDetectionDataPoint(1.0, 1.0, confidence = 1.0,
+            anomalyCheckRange = defaultBoundedRange, isAnomaly = false)))
+      }
+      val constraint = AnomalyExtendedResultsConstraint[NumMatches, Double, Double](
+        SampleAnalyzer("att1"), anomalyAssertionFunction, Some(problematicValuePicker))
+
+      calculate(constraint, df) match {
+        case result =>
+          assert(result.status == ConstraintStatus.Failure)
+          assert(result.message.get.contains("Can't retrieve the value to assert on"))
+          assert(result.metric.isDefined)
+      }
+
+      constraint.evaluate(validResults) match {
+        case result =>
+          assert(result.status == ConstraintStatus.Failure)
+          assert(result.message.isDefined)
+          assert(result.message.get.startsWith("Can't retrieve the value to assert on"))
+          assert(result.metric.isDefined)
+      }
+
+      constraint.evaluate(emptyResults) match {
+        case result =>
+          assert(result.status == ConstraintStatus.Failure)
+          assert(result.message.contains("Missing Analysis, can't run the constraint!"))
+          assert(result.metric.isEmpty)
+      }
+    }
+
+    "fail on failed assertion function with hint in exception message if provided" in
+      withSparkSession { sparkSession =>
+
+        val df = getDfMissing(sparkSession)
+
+        val anomalyAssertionFunction = (_: Double) => {
+          AnomalyDetectionAssertionResult(hasAnomaly = true,
+            AnomalyDetectionExtendedResult(AnomalyDetectionDataPoint(1.0, 1.0, confidence = 1.0,
+              anomalyCheckRange = defaultBoundedRange, isAnomaly = true)))
+        }
+
+        val failingConstraint = AnomalyExtendedResultsConstraint[NumMatches, Double, Double](
+          SampleAnalyzer("att1"), anomalyAssertionFunction, hint = Some("Value should be like ...!"))
+
+        calculate(failingConstraint, df) match {
+          case result =>
+            assert(result.status == ConstraintStatus.Failure)
+            assert(result.message.isDefined)
+            assert(result.message.get == "Value: 1.0 does not meet the constraint requirement, " +
+              "check the anomaly detection metadata! Value should be like ...!")
+            assert(result.metric.isDefined)
+        }
+      }
+
+    "return failed constraint for a failing assertion" in withSparkSession { session =>
+      val msg = "-test-"
+      val exception = new RuntimeException(msg)
+      val df = getDfMissing(session)
+
+      def failingAssertion(value: Double): AnomalyDetectionAssertionResult = throw exception
+
+      val constraintResult = calculate(
+        AnomalyExtendedResultsConstraint[NumMatches, Double, Double](
+          SampleAnalyzer("att1"), failingAssertion), df
+      )
+
+      assert(constraintResult.status == ConstraintStatus.Failure)
+      assert(constraintResult.metric.isDefined)
+      assert(constraintResult.message.contains(s"Can't execute the assertion: $msg!"))
+    }
+
+  }
+}
diff --git a/src/test/scala/com/amazon/deequ/constraints/ConstraintUtils.scala b/src/test/scala/com/amazon/deequ/constraints/ConstraintUtils.scala
index 5782bc18c..27065cafb 100644
--- a/src/test/scala/com/amazon/deequ/constraints/ConstraintUtils.scala
+++ b/src/test/scala/com/amazon/deequ/constraints/ConstraintUtils.scala
@@ -21,12 +21,15 @@ import org.apache.spark.sql.DataFrame
 object ConstraintUtils {
 
   def calculate(constraint: Constraint, df: DataFrame): ConstraintResult = {
-
-    val analysisBasedConstraint = constraint match {
-      case nc: ConstraintDecorator => nc.inner
-      case c: Constraint => c
+    val finalConstraint = constraint match {
+      case nc: ConstraintDecorator => nc.inner
+      case c: Constraint => c
+    }
+    finalConstraint match {
+      case c: AnalysisBasedConstraint[_, _, _] => c.calculateAndEvaluate(df)
+      case c: AnomalyExtendedResultsConstraint[_, _, _] => c.calculateAndEvaluate(df)
     }
-
-    analysisBasedConstraint.asInstanceOf[AnalysisBasedConstraint[_, _, _]].calculateAndEvaluate(df)
   }
 }
diff --git a/src/test/scala/com/amazon/deequ/constraints/ConstraintsTest.scala b/src/test/scala/com/amazon/deequ/constraints/ConstraintsTest.scala
index e4a8ba898..cd8d91d91 100644
--- a/src/test/scala/com/amazon/deequ/constraints/ConstraintsTest.scala
+++ b/src/test/scala/com/amazon/deequ/constraints/ConstraintsTest.scala
@@ -18,13 +18,21 @@ package com.amazon.deequ
 package constraints
 
 import com.amazon.deequ.utils.FixtureSupport
-import org.scalatest.{Matchers, WordSpec}
+import org.scalatest.Matchers
+import org.scalatest.WordSpec
 import ConstraintUtils.calculate
-import com.amazon.deequ.analyzers.{Completeness, NumMatchesAndCount}
+import com.amazon.deequ.analyzers.Completeness
+import com.amazon.deequ.analyzers.NumMatchesAndCount
 import org.apache.spark.sql.Row
-import org.apache.spark.sql.types.{DoubleType, StringType}
+import org.apache.spark.sql.types.DoubleType
+import org.apache.spark.sql.types.StringType
 import Constraint._
 import com.amazon.deequ.SparkContextSpec
+import com.amazon.deequ.anomalydetection.AnomalyDetectionAssertionResult
+import com.amazon.deequ.anomalydetection.AnomalyDetectionDataPoint
+import com.amazon.deequ.anomalydetection.AnomalyDetectionExtendedResult
+import com.amazon.deequ.anomalydetection.Bound
+import com.amazon.deequ.anomalydetection.BoundedRange
 
 class ConstraintsTest extends WordSpec with Matchers with SparkContextSpec with FixtureSupport {
 
@@ -174,4 +182,30 @@ class ConstraintsTest extends WordSpec with Matchers with SparkContextSpec with
         Completeness("att2"), _ < 0.7), df).status == ConstraintStatus.Failure)
     }
   }
+
+  "Anomaly constraint with Extended Results" should {
+    "assert on anomaly analyzer values" in withSparkSession { sparkSession =>
+      val df = getDfMissing(sparkSession)
+      val defaultBoundedRange = BoundedRange(lowerBound = Bound(0.0, inclusive = true),
+        upperBound = Bound(1.0, inclusive = true))
+
+      assert(calculate(Constraint.anomalyConstraintWithExtendedResults[NumMatchesAndCount](
+        Completeness("att1"), (_: Double) => {
+          AnomalyDetectionAssertionResult(hasAnomaly = false,
+            AnomalyDetectionExtendedResult(AnomalyDetectionDataPoint(1.0, 1.0, confidence = 1.0,
+              anomalyCheckRange = defaultBoundedRange, isAnomaly = false)))
+        }), df)
+        .status == ConstraintStatus.Success)
+
+      assert(calculate(Constraint.anomalyConstraintWithExtendedResults[NumMatchesAndCount](
+        Completeness("att1"), (_: Double) => {
+          AnomalyDetectionAssertionResult(hasAnomaly = true,
+            AnomalyDetectionExtendedResult(AnomalyDetectionDataPoint(1.0, 1.0, confidence = 1.0,
+              anomalyCheckRange = defaultBoundedRange, isAnomaly = true)))
+        }), df)
+        .status == ConstraintStatus.Failure)
+    }
+  }
 }
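The assertion functions stubbed throughout these constraint tests all share one shape: `Double => AnomalyDetectionAssertionResult`, returning both the verdict and the data point that was evaluated. A standalone sketch of such a function — the unit-range threshold is arbitrary, chosen only to illustrate the shape:

```scala
import com.amazon.deequ.anomalydetection.{AnomalyDetectionAssertionResult, AnomalyDetectionDataPoint,
  AnomalyDetectionExtendedResult, Bound, BoundedRange}

// Arbitrary illustration: flag any metric value outside [0.0, 1.0]
def assertWithinUnitRange(metricValue: Double): AnomalyDetectionAssertionResult = {
  val range = BoundedRange(Bound(0.0, inclusive = true), Bound(1.0, inclusive = true))
  val outside = metricValue < range.lowerBound.value || metricValue > range.upperBound.value
  AnomalyDetectionAssertionResult(
    hasAnomaly = outside,
    AnomalyDetectionExtendedResult(
      AnomalyDetectionDataPoint(metricValue, metricValue,
        anomalyCheckRange = range, isAnomaly = outside, confidence = 1.0)))
}
```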
diff --git a/src/test/scala/com/amazon/deequ/repository/MetricsRepositoryAnomalyDetectionIntegrationTest.scala b/src/test/scala/com/amazon/deequ/repository/MetricsRepositoryAnomalyDetectionIntegrationTest.scala
index c73ac95b0..2cd475ac6 100644
--- a/src/test/scala/com/amazon/deequ/repository/MetricsRepositoryAnomalyDetectionIntegrationTest.scala
+++ b/src/test/scala/com/amazon/deequ/repository/MetricsRepositoryAnomalyDetectionIntegrationTest.scala
@@ -57,9 +57,29 @@ class MetricsRepositoryAnomalyDetectionIntegrationTest extends AnyWordSpec with
     }
   }
 
+  "Anomaly Detection with Extended Results" should {
+
+    "work using the InMemoryMetricsRepository" in withSparkSession { session =>
+
+      val repository = new InMemoryMetricsRepository()
+
+      testAnomalyDetection(session, repository, useExtendedResults = true)
+    }
+
+    "work using the FileSystemMetricsRepository" in withSparkSession { session =>
+
+      val tempDir = TempFileUtils.tempDir("fileSystemRepositoryTest")
+      val repository = new FileSystemMetricsRepository(session, tempDir + "repository-test.json")
+
+      testAnomalyDetection(session, repository, useExtendedResults = true)
+    }
+  }
+
   private[this] def testAnomalyDetection(
       session: SparkSession,
-      repository: MetricsRepository)
+      repository: MetricsRepository,
+      useExtendedResults: Boolean = false)
     : Unit = {
 
     val data = getTestData(session)
@@ -71,8 +91,15 @@ class MetricsRepositoryAnomalyDetectionIntegrationTest extends AnyWordSpec with
     val (otherCheck, additionalRequiredAnalyzers) = getNormalCheckAndRequiredAnalyzers()
 
     // This method is where the interesting stuff happens
-    val verificationResult = createAnomalyChecksAndRunEverything(data, repository, otherCheck,
-      additionalRequiredAnalyzers)
+    val verificationResult =
+      if (useExtendedResults) {
+        createAnomalyChecksWithExtendedResultsAndRunEverything(
+          data, repository, otherCheck, additionalRequiredAnalyzers)
+      } else {
+        createAnomalyChecksAndRunEverything(data, repository, otherCheck, additionalRequiredAnalyzers)
+      }
 
     printConstraintResults(verificationResult)
 
@@ -189,6 +216,56 @@ class MetricsRepositoryAnomalyDetectionIntegrationTest extends AnyWordSpec with
       .run()
   }
 
+  private[this] def createAnomalyChecksWithExtendedResultsAndRunEverything(
+      data: DataFrame,
+      repository: MetricsRepository,
+      otherCheck: Check,
+      additionalRequiredAnalyzers: Seq[Analyzer[_, Metric[_]]])
+    : VerificationResult = {
+
+    // We only want to use historic data with the EU tag for the anomaly checks since the new
+    // data point is from the EU marketplace
+    val filterEU = Map("marketplace" -> "EU")
+
+    // We only want to use data points from 2018 that lie before the date time
+    // associated with the current data point
+    val afterDateTime = createDate(2018, 1, 1)
+    val beforeDateTime = createDate(2018, 8, 1)
+
+    // Config for the size anomaly check
+    val sizeAnomalyCheckConfig = AnomalyCheckConfig(CheckLevel.Error, "Size only increases",
+      filterEU, Some(afterDateTime), Some(beforeDateTime))
+    val sizeAnomalyDetectionStrategy = AbsoluteChangeStrategy(Some(0))
+
+    // Config for the mean sales anomaly check
+    val meanSalesAnomalyCheckConfig = AnomalyCheckConfig(
+      CheckLevel.Warning,
+      "Sales mean within 2 standard deviations",
+      filterEU,
+      Some(afterDateTime),
+      Some(beforeDateTime)
+    )
+    val meanSalesAnomalyDetectionStrategy = OnlineNormalStrategy(upperDeviationFactor = Some(2),
+      ignoreAnomalies = false)
+
+    // ResultKey to be used when saving the results of this run
+    val currentRunResultKey = ResultKey(createDate(2018, 8, 1), Map("marketplace" -> "EU"))
+
+    VerificationSuite()
+      .onData(data)
+      .addCheck(otherCheck)
+      .addRequiredAnalyzers(additionalRequiredAnalyzers)
+      .useRepository(repository)
+      // Add the Size anomaly check
+      .addAnomalyCheckWithExtendedResults(sizeAnomalyDetectionStrategy, Size(), Some(sizeAnomalyCheckConfig))
+      // Add the Mean sales anomaly check
+      .addAnomalyCheckWithExtendedResults(meanSalesAnomalyDetectionStrategy, Mean("sales"),
+        Some(meanSalesAnomalyCheckConfig))
+      // Save the new data point in the repository after we calculated everything
+      .saveOrAppendResult(currentRunResultKey)
+      .run()
+  }
+
   private[this] def assertAnomalyCheckResultsAreCorrect(
       verificationResult: VerificationResult)
     : Unit = {