From 1f8a97da72b4cb99a3d8f36a6139aef03f7613df Mon Sep 17 00:00:00 2001
From: zeotuan <48720253+zeotuan@users.noreply.github.com>
Date: Sat, 25 May 2024 02:08:22 +1000
Subject: [PATCH] Adding Wilson Score Confidence Interval Strategy (#567)
* Configurable RetainCompletenessRule
* Add doc string
* Add default completeness const
* Add ConfidenceIntervalStrategy
* Add Separate Wilson and Wald Interval Test
* Add License information, Fix formatting
* Add License information
* formatting fix
* Update documentation
* Make WaldInterval the default strategy for now
* Formatting import to per line
* Separate group import to per line import
---
.../ConstraintSuggestionExample.scala | 6 ++
.../examples/constraint_suggestion_example.md | 13 +++
.../FractionalCategoricalRangeRule.scala | 12 +--
.../rules/RetainCompletenessRule.scala | 19 ++--
.../interval/ConfidenceIntervalStrategy.scala | 55 +++++++++++
.../rules/interval/WaldIntervalStrategy.scala | 47 +++++++++
.../WilsonScoreIntervalStrategy.scala | 47 +++++++++
.../rules/ConstraintRulesTest.scala | 95 ++++++++++++-------
.../rules/interval/IntervalStrategyTest.scala | 59 ++++++++++++
9 files changed, 299 insertions(+), 54 deletions(-)
create mode 100644 src/main/scala/com/amazon/deequ/suggestions/rules/interval/ConfidenceIntervalStrategy.scala
create mode 100644 src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala
create mode 100644 src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala
create mode 100644 src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala
diff --git a/src/main/scala/com/amazon/deequ/examples/ConstraintSuggestionExample.scala b/src/main/scala/com/amazon/deequ/examples/ConstraintSuggestionExample.scala
index 8aa0fb6c5..fc8f458bf 100644
--- a/src/main/scala/com/amazon/deequ/examples/ConstraintSuggestionExample.scala
+++ b/src/main/scala/com/amazon/deequ/examples/ConstraintSuggestionExample.scala
@@ -17,6 +17,8 @@
package com.amazon.deequ.examples
import com.amazon.deequ.examples.ExampleUtils.withSpark
+import com.amazon.deequ.suggestions.rules.RetainCompletenessRule
+import com.amazon.deequ.suggestions.rules.interval.WilsonScoreIntervalStrategy
import com.amazon.deequ.suggestions.{ConstraintSuggestionRunner, Rules}
private[examples] object ConstraintSuggestionExample extends App {
@@ -51,6 +53,10 @@ private[examples] object ConstraintSuggestionExample extends App {
val suggestionResult = ConstraintSuggestionRunner()
.onData(data)
.addConstraintRules(Rules.EXTENDED)
+ // We can also add our own constraint and customize constraint parameters
+ .addConstraintRule(
+ RetainCompletenessRule(intervalStrategy = WilsonScoreIntervalStrategy())
+ )
.run()
// We can now investigate the constraints that deequ suggested. We get a textual description
diff --git a/src/main/scala/com/amazon/deequ/examples/constraint_suggestion_example.md b/src/main/scala/com/amazon/deequ/examples/constraint_suggestion_example.md
index df159a9c9..472f63c7d 100644
--- a/src/main/scala/com/amazon/deequ/examples/constraint_suggestion_example.md
+++ b/src/main/scala/com/amazon/deequ/examples/constraint_suggestion_example.md
@@ -43,6 +43,17 @@ val suggestionResult = ConstraintSuggestionRunner()
.run()
```
+Alternatively, we also support customizing and adding an individual constraint rule using `addConstraintRule()`:
+```scala
+val suggestionResult = ConstraintSuggestionRunner()
+ .onData(data)
+
+ .addConstraintRule(
+ RetainCompletenessRule(intervalStrategy = WilsonScoreIntervalStrategy())
+ )
+ .run()
+```
+
We can now investigate the constraints that deequ suggested. We get a textual description and the corresponding scala code for each suggested constraint. Note that the constraint suggestion is based on heuristic rules and assumes that the data it is shown is 'static' and correct, which might often not be the case in the real world. Therefore the suggestions should always be manually reviewed before being applied in real deployments.
```scala
suggestionResult.constraintSuggestions.foreach { case (column, suggestions) =>
@@ -92,3 +103,5 @@ The corresponding scala code is .isContainedIn("status", Array("DELAYED", "UNKNO
Currently, we leave it up to the user to decide whether they want to apply the suggested constraints or not, and provide the corresponding Scala code for convenience. For larger datasets, it makes sense to evaluate the suggested constraints on some held-out portion of the data to see whether they hold or not. You can test this by adding an invocation of `.useTrainTestSplitWithTestsetRatio(0.1)` to the `ConstraintSuggestionRunner`. With this configuration, it would compute constraint suggestions on 90% of the data and evaluate the suggested constraints on the remaining 10%.
Finally, we would also like to note that the constraint suggestion code provides access to the underlying [column profiles](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/data_profiling_example.md) that it computed via `suggestionResult.columnProfiles`.
+
+An [executable and extended version of this example](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/ConstraintSuggestionExample.scala) is part of our code base.
diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala
index 55e410f33..f9dd192e8 100644
--- a/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala
+++ b/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala
@@ -23,16 +23,17 @@ import com.amazon.deequ.metrics.DistributionValue
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.suggestions.ConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestionWithValue
+import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.defaultIntervalStrategy
+import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy
import org.apache.commons.lang3.StringEscapeUtils
-import scala.math.BigDecimal.RoundingMode
-
/** If we see a categorical range for most values in a column, we suggest an IS IN (...)
* constraint that should hold for most values */
case class FractionalCategoricalRangeRule(
targetDataCoverageFraction: Double = 0.9,
categorySorter: Array[(String, DistributionValue)] => Array[(String, DistributionValue)] =
- categories => categories.sortBy({ case (_, value) => value.absolute }).reverse
+ categories => categories.sortBy({ case (_, value) => value.absolute }).reverse,
+ intervalStrategy: ConfidenceIntervalStrategy = defaultIntervalStrategy
) extends ConstraintRule[ColumnProfile] {
override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
@@ -79,11 +80,8 @@ case class FractionalCategoricalRangeRule(
val p = ratioSums
val n = numRecords
- val z = 1.96
- // TODO this needs to be more robust for p's close to 0 or 1
- val targetCompliance = BigDecimal(p - z * math.sqrt(p * (1 - p) / n))
- .setScale(2, RoundingMode.DOWN).toDouble
+ val targetCompliance = intervalStrategy.calculateTargetConfidenceInterval(p, n).lowerBound
val description = s"'${profile.column}' has value range $categoriesSql for at least " +
s"${targetCompliance * 100}% of values"
diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala
index 9f995a112..be5bd101f 100644
--- a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala
+++ b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala
@@ -21,8 +21,8 @@ import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion
import com.amazon.deequ.suggestions.rules.RetainCompletenessRule._
-
-import scala.math.BigDecimal.RoundingMode
+import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.defaultIntervalStrategy
+import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy
/**
* If a column is incomplete in the sample, we model its completeness as a binomial variable,
@@ -33,21 +33,18 @@ import scala.math.BigDecimal.RoundingMode
*/
case class RetainCompletenessRule(
minCompleteness: Double = defaultMinCompleteness,
- maxCompleteness: Double = defaultMaxCompleteness
+ maxCompleteness: Double = defaultMaxCompleteness,
+ intervalStrategy: ConfidenceIntervalStrategy = defaultIntervalStrategy
) extends ConstraintRule[ColumnProfile] {
override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = {
profile.completeness > minCompleteness && profile.completeness < maxCompleteness
}
override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
-
- val p = profile.completeness
- val n = numRecords
- val z = 1.96
-
- // TODO this needs to be more robust for p's close to 0 or 1
- val targetCompleteness = BigDecimal(p - z * math.sqrt(p * (1 - p) / n))
- .setScale(2, RoundingMode.DOWN).toDouble
+ val targetCompleteness = intervalStrategy.calculateTargetConfidenceInterval(
+ profile.completeness,
+ numRecords
+ ).lowerBound
val constraint = completenessConstraint(profile.column, _ >= targetCompleteness)
diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/ConfidenceIntervalStrategy.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/ConfidenceIntervalStrategy.scala
new file mode 100644
index 000000000..0c12e03a5
--- /dev/null
+++ b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/ConfidenceIntervalStrategy.scala
@@ -0,0 +1,55 @@
+/**
+ * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"). You may not
+ * use this file except in compliance with the License. A copy of the License
+ * is located at
+ *
+ * http://aws.amazon.com/apache2.0/
+ *
+ * or in the "license" file accompanying this file. This file is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+package com.amazon.deequ.suggestions.rules.interval
+
+import breeze.stats.distributions.{Gaussian, Rand}
+import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy._
+
+/** Strategy for calculating a confidence interval for a binomial proportion. */
+trait ConfidenceIntervalStrategy {
+  /**
+   * Calculates the confidence interval for an observed proportion.
+   * @param pHat observed proportion of the records that share a trait, within [0.0, 1.0]
+   * @param numRecords overall number of records
+   * @param confidence confidence level of the method used to estimate the interval
+   * @return the lower and upper bound of the estimated interval
+   */
+  def calculateTargetConfidenceInterval(
+    pHat: Double,
+    numRecords: Long,
+    confidence: Double = defaultConfidence
+  ): ConfidenceInterval
+
+  /** Fails with an IllegalArgumentException unless pHat is in [0.0, 1.0] and confidence in [0.0, 1.0). */
+  def validateInput(pHat: Double, confidence: Double): Unit = {
+    require(0.0 <= pHat && pHat <= 1.0, "pHat must be between 0.0 and 1.0")
+    // A confidence of exactly 1.0 has no finite z-score, so the upper end is exclusive.
+    require(0.0 <= confidence && confidence < 1.0, "confidence must be at least 0.0 and below 1.0")
+  }
+
+  /** Two-sided z-score: the standard normal quantile at 1 - (1 - confidence) / 2. */
+  def calculateZScore(confidence: Double): Double = Gaussian(0, 1)(Rand).inverseCdf(1 - ((1.0 - confidence)/ 2.0))
+}
+
+object ConfidenceIntervalStrategy {
+  /** Lower and upper bound of an estimated confidence interval. */
+  case class ConfidenceInterval(lowerBound: Double, upperBound: Double)
+  /** Confidence level applied when a caller does not pass one explicitly. */
+  val defaultConfidence: Double = 0.95
+  /** Strategy used when a rule is built without an explicit one; the Wald interval for now. */
+  val defaultIntervalStrategy: ConfidenceIntervalStrategy = WaldIntervalStrategy()
+}
diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala
new file mode 100644
index 000000000..154d8ebfe
--- /dev/null
+++ b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala
@@ -0,0 +1,47 @@
+/**
+ * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"). You may not
+ * use this file except in compliance with the License. A copy of the License
+ * is located at
+ *
+ * http://aws.amazon.com/apache2.0/
+ *
+ * or in the "license" file accompanying this file. This file is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+package com.amazon.deequ.suggestions.rules.interval
+
+import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.ConfidenceInterval
+import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.defaultConfidence
+
+import scala.math.BigDecimal.RoundingMode
+
+/**
+ * Implements the Wald interval method for creating a binomial proportion confidence interval. Provided for
+ * backwards compatibility. Using [[WaldIntervalStrategy]] for calculating a confidence interval can be
+ * problematic when dealing with small sample sizes or proportions close to 0 or 1. It also has poorer coverage
+ * and might produce confidence limits outside the range of [0,1].
+ * @see Normal approximation interval (Wikipedia)
+ */
+@deprecated("WilsonScoreIntervalStrategy is recommended for calculating confidence interval")
+case class WaldIntervalStrategy() extends ConfidenceIntervalStrategy {
+  def calculateTargetConfidenceInterval(
+    pHat: Double,
+    numRecords: Long,
+    confidence: Double = defaultConfidence
+  ): ConfidenceInterval = {
+    validateInput(pHat, confidence)
+    require(numRecords > 0, "numRecords must be greater than 0") // numRecords is the divisor below
+    val successRatio = BigDecimal(pHat)
+    val marginOfError = BigDecimal(calculateZScore(confidence) * math.sqrt(pHat * (1 - pHat) / numRecords))
+    val lowerBound = (successRatio - marginOfError).setScale(2, RoundingMode.DOWN).toDouble
+    val upperBound = (successRatio + marginOfError).setScale(2, RoundingMode.UP).toDouble
+    ConfidenceInterval(lowerBound, upperBound)
+  }
+}
diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala
new file mode 100644
index 000000000..6e8371ea5
--- /dev/null
+++ b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala
@@ -0,0 +1,47 @@
+/**
+ * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"). You may not
+ * use this file except in compliance with the License. A copy of the License
+ * is located at
+ *
+ * http://aws.amazon.com/apache2.0/
+ *
+ * or in the "license" file accompanying this file. This file is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+package com.amazon.deequ.suggestions.rules.interval
+
+import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.ConfidenceInterval
+import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.defaultConfidence
+
+import scala.math.BigDecimal.RoundingMode
+
+/**
+ * Uses the Wilson score method for creating a binomial proportion confidence interval.
+ * @see Wilson score interval (Wikipedia)
+ */
+case class WilsonScoreIntervalStrategy() extends ConfidenceIntervalStrategy {
+  def calculateTargetConfidenceInterval(
+    pHat: Double,
+    numRecords: Long,
+    confidence: Double = defaultConfidence
+  ): ConfidenceInterval = {
+    validateInput(pHat, confidence)
+    require(numRecords > 0, "numRecords must be greater than 0") // numRecords is the divisor below
+    val zScore = calculateZScore(confidence)
+    val zSquareOverN = math.pow(zScore, 2) / numRecords
+    val factor = 1.0 / (1 + zSquareOverN)
+    val adjustedSuccessRatio = pHat + zSquareOverN / 2
+    val marginOfError = zScore * math.sqrt(pHat * (1 - pHat) / numRecords + zSquareOverN / (4.0 * numRecords))
+    // Clamp before rounding: floating-point error can push a bound marginally outside [0, 1] (e.g. pHat = 1.0).
+    val lower = BigDecimal(math.max(0.0, factor * (adjustedSuccessRatio - marginOfError)))
+    val upper = BigDecimal(math.min(1.0, factor * (adjustedSuccessRatio + marginOfError)))
+    ConfidenceInterval(lower.setScale(2, RoundingMode.DOWN).toDouble, upper.setScale(2, RoundingMode.UP).toDouble)
+  }
+}
diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala
index 701a5d983..7b56e3938 100644
--- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala
+++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala
@@ -22,10 +22,14 @@ import com.amazon.deequ.checks.{Check, CheckLevel}
import com.amazon.deequ.constraints.ConstrainableDataTypes
import com.amazon.deequ.metrics.{Distribution, DistributionValue}
import com.amazon.deequ.profiles._
+import com.amazon.deequ.suggestions.rules.interval.WaldIntervalStrategy
+import com.amazon.deequ.suggestions.rules.interval.WilsonScoreIntervalStrategy
import com.amazon.deequ.utils.FixtureSupport
import com.amazon.deequ.{SparkContextSpec, VerificationSuite}
import org.scalamock.scalatest.MockFactory
+import org.scalatest.Inspectors.forAll
import org.scalatest.WordSpec
+import org.scalatest.prop.Tables.Table
class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContextSpec
with MockFactory{
@@ -132,6 +136,7 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext
val complete = StandardColumnProfile("col1", 1.0, 100, String, false, Map.empty, None)
val tenPercent = StandardColumnProfile("col1", 0.1, 100, String, false, Map.empty, None)
val incomplete = StandardColumnProfile("col1", .25, 100, String, false, Map.empty, None)
+ val waldIntervalStrategy = WaldIntervalStrategy()
assert(!RetainCompletenessRule().shouldBeApplied(complete, 1000))
assert(!RetainCompletenessRule(0.05, 0.9).shouldBeApplied(complete, 1000))
@@ -139,74 +144,92 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext
assert(RetainCompletenessRule(0.0).shouldBeApplied(tenPercent, 1000))
assert(RetainCompletenessRule(0.0).shouldBeApplied(incomplete, 1000))
assert(RetainCompletenessRule().shouldBeApplied(incomplete, 1000))
+ assert(!RetainCompletenessRule(intervalStrategy = waldIntervalStrategy).shouldBeApplied(complete, 1000))
+ assert(!RetainCompletenessRule(0.05, 0.9, waldIntervalStrategy).shouldBeApplied(complete, 1000))
+ assert(RetainCompletenessRule(0.05, 0.9, waldIntervalStrategy).shouldBeApplied(tenPercent, 1000))
}
"return evaluable constraint candidates" in
withSparkSession { session =>
+ val table = Table(("strategy", "result"), (WaldIntervalStrategy(), true), (WilsonScoreIntervalStrategy(), true))
+ forAll(table) { case (strategy, result) =>
+ val dfWithColumnCandidate = getDfFull(session)
- val dfWithColumnCandidate = getDfFull(session)
+ val fakeColumnProfile = getFakeColumnProfileWithNameAndCompleteness("att1", 0.5)
- val fakeColumnProfile = getFakeColumnProfileWithNameAndCompleteness("att1", 0.5)
+ val check = Check(CheckLevel.Warning, "some")
+ .addConstraint(
+ RetainCompletenessRule(intervalStrategy = strategy).candidate(fakeColumnProfile, 100).constraint
+ )
- val check = Check(CheckLevel.Warning, "some")
- .addConstraint(RetainCompletenessRule().candidate(fakeColumnProfile, 100).constraint)
+ val verificationResult = VerificationSuite()
+ .onData(dfWithColumnCandidate)
+ .addCheck(check)
+ .run()
- val verificationResult = VerificationSuite()
- .onData(dfWithColumnCandidate)
- .addCheck(check)
- .run()
+ val metricResult = verificationResult.metrics.head._2
- val metricResult = verificationResult.metrics.head._2
+ assert(metricResult.value.isSuccess == result)
+ }
- assert(metricResult.value.isSuccess)
}
"return working code to add constraint to check" in
withSparkSession { session =>
+ val table = Table(
+ ("strategy", "colCompleteness", "targetCompleteness", "result"),
+ (WaldIntervalStrategy(), 0.5, 0.4, true),
+ (WilsonScoreIntervalStrategy(), 0.4, 0.3, true)
+ )
+ forAll(table) { case (strategy, colCompleteness, targetCompleteness, result) =>
- val dfWithColumnCandidate = getDfFull(session)
+ val dfWithColumnCandidate = getDfFull(session)
- val fakeColumnProfile = getFakeColumnProfileWithNameAndCompleteness("att1", 0.5)
+ val fakeColumnProfile = getFakeColumnProfileWithNameAndCompleteness("att1", colCompleteness)
- val codeForConstraint = RetainCompletenessRule().candidate(fakeColumnProfile, 100)
- .codeForConstraint
+ val codeForConstraint = RetainCompletenessRule(intervalStrategy = strategy).candidate(fakeColumnProfile, 100)
+ .codeForConstraint
- val expectedCodeForConstraint = """.hasCompleteness("att1", _ >= 0.4,
- | Some("It should be above 0.4!"))""".stripMargin.replaceAll("\n", "")
+ val expectedCodeForConstraint = s""".hasCompleteness("att1", _ >= $targetCompleteness,
+ | Some("It should be above $targetCompleteness!"))""".stripMargin.replaceAll("\n", "")
- assert(expectedCodeForConstraint == codeForConstraint)
+ assert(expectedCodeForConstraint == codeForConstraint)
- val check = Check(CheckLevel.Warning, "some")
- .hasCompleteness("att1", _ >= 0.4, Some("It should be above 0.4!"))
+ val check = Check(CheckLevel.Warning, "some")
+ .hasCompleteness("att1", _ >= targetCompleteness, Some(s"It should be above $targetCompleteness"))
- val verificationResult = VerificationSuite()
- .onData(dfWithColumnCandidate)
- .addCheck(check)
- .run()
+ val verificationResult = VerificationSuite()
+ .onData(dfWithColumnCandidate)
+ .addCheck(check)
+ .run()
- val metricResult = verificationResult.metrics.head._2
+ val metricResult = verificationResult.metrics.head._2
+
+ assert(metricResult.value.isSuccess == result)
+ }
- assert(metricResult.value.isSuccess)
}
"return evaluable constraint candidates with custom min/max completeness" in
withSparkSession { session =>
+ val table = Table(("strategy", "result"), (WaldIntervalStrategy(), true), (WilsonScoreIntervalStrategy(), true))
+ forAll(table) { case (strategy, result) =>
+ val dfWithColumnCandidate = getDfFull(session)
- val dfWithColumnCandidate = getDfFull(session)
-
- val fakeColumnProfile = getFakeColumnProfileWithNameAndCompleteness("att1", 0.5)
+ val fakeColumnProfile = getFakeColumnProfileWithNameAndCompleteness("att1", 0.5)
- val check = Check(CheckLevel.Warning, "some")
- .addConstraint(RetainCompletenessRule(0.4, 0.6).candidate(fakeColumnProfile, 100).constraint)
+ val check = Check(CheckLevel.Warning, "some")
+ .addConstraint(RetainCompletenessRule(0.4, 0.6, strategy).candidate(fakeColumnProfile, 100).constraint)
- val verificationResult = VerificationSuite()
- .onData(dfWithColumnCandidate)
- .addCheck(check)
- .run()
+ val verificationResult = VerificationSuite()
+ .onData(dfWithColumnCandidate)
+ .addCheck(check)
+ .run()
- val metricResult = verificationResult.metrics.head._2
+ val metricResult = verificationResult.metrics.head._2
- assert(metricResult.value.isSuccess)
+ assert(metricResult.value.isSuccess == result)
+ }
}
}
diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala
new file mode 100644
index 000000000..54e6cd1e1
--- /dev/null
+++ b/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala
@@ -0,0 +1,59 @@
+/**
+ * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"). You may not
+ * use this file except in compliance with the License. A copy of the License
+ * is located at
+ *
+ * http://aws.amazon.com/apache2.0/
+ *
+ * or in the "license" file accompanying this file. This file is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+package com.amazon.deequ.suggestions.rules.interval
+
+import com.amazon.deequ.SparkContextSpec
+import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.ConfidenceInterval
+import com.amazon.deequ.utils.FixtureSupport
+import org.scalamock.scalatest.MockFactory
+import org.scalatest.Inspectors.forAll
+import org.scalatest.prop.Tables.Table
+import org.scalatest.wordspec.AnyWordSpec
+
+class IntervalStrategyTest extends AnyWordSpec with FixtureSupport with SparkContextSpec with MockFactory {
+  "ConfidenceIntervalStrategy" should {
+    // Expected bounds are the two-decimal values: lower bound rounded down, upper bound rounded up.
+    "be calculated correctly" in {
+      val waldStrategy = WaldIntervalStrategy()
+      val wilsonStrategy = WilsonScoreIntervalStrategy()
+      val table = Table(
+        ("strategy", "pHat", "numRecord", "lowerBound", "upperBound"),
+        (waldStrategy, 1.0, 20L, 1.0, 1.0),
+        (waldStrategy, 0.5, 100L, 0.4, 0.6),
+        (waldStrategy, 0.4, 100L, 0.3, 0.5),
+        (waldStrategy, 0.6, 100L, 0.5, 0.7),
+        (waldStrategy, 0.9, 100L, 0.84, 0.96),
+        (waldStrategy, 1.0, 100L, 1.0, 1.0),
+        (wilsonStrategy, 0.01, 20L, 0.00, 0.18),
+        (wilsonStrategy, 1.0, 20L, 0.83, 1.0),
+        (wilsonStrategy, 0.5, 100L, 0.4, 0.6),
+        (wilsonStrategy, 0.4, 100L, 0.3, 0.5),
+        (wilsonStrategy, 0.6, 100L, 0.5, 0.7),
+        (wilsonStrategy, 0.9, 100L, 0.82, 0.95),
+        (wilsonStrategy, 1.0, 100L, 0.96, 1.0)
+      )
+      forAll(table) { case (strategy, pHat, numRecords, lowerBound, upperBound) =>
+        val actualInterval = strategy.calculateTargetConfidenceInterval(pHat, numRecords)
+        assert(actualInterval == ConfidenceInterval(lowerBound, upperBound))
+      }
+    }
+    // An out-of-range proportion and a record count of zero must both be rejected, never turned into bounds.
+    "reject invalid input" in forAll(Seq(WaldIntervalStrategy(), WilsonScoreIntervalStrategy())) { strategy =>
+      assertThrows[IllegalArgumentException](strategy.calculateTargetConfidenceInterval(1.5, 100L))
+      assertThrows[IllegalArgumentException](strategy.calculateTargetConfidenceInterval(0.5, 0L))
+    }
+  }
+}