From 3b41e4c46a23bc44f15f76b2c087e65ae6bf471e Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Fri, 19 Apr 2024 11:06:06 +1000 Subject: [PATCH 01/12] Configurable RetainCompletenessRule --- .../rules/RetainCompletenessRule.scala | 5 ++-- .../rules/ConstraintRulesTest.scala | 25 +++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala index 67ae61f92..f632b7d47 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala @@ -27,10 +27,9 @@ import scala.math.BigDecimal.RoundingMode * If a column is incomplete in the sample, we model its completeness as a binomial variable, * estimate a confidence interval and use this to define a lower bound for the completeness */ -case class RetainCompletenessRule() extends ConstraintRule[ColumnProfile] { - +case class RetainCompletenessRule(minCompleteness: Double = 0.2, maxCompleteness: Double = 1.0, sensitivity: Double = 1.96) extends ConstraintRule[ColumnProfile] { override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { - profile.completeness > 0.2 && profile.completeness < 1.0 + profile.completeness > minCompleteness && profile.completeness < maxCompleteness } override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = { diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index 075247932..701a5d983 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -130,9 +130,14 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with 
SparkContext "be applied correctly" in { val complete = StandardColumnProfile("col1", 1.0, 100, String, false, Map.empty, None) + val tenPercent = StandardColumnProfile("col1", 0.1, 100, String, false, Map.empty, None) val incomplete = StandardColumnProfile("col1", .25, 100, String, false, Map.empty, None) assert(!RetainCompletenessRule().shouldBeApplied(complete, 1000)) + assert(!RetainCompletenessRule(0.05, 0.9).shouldBeApplied(complete, 1000)) + assert(RetainCompletenessRule(0.05, 0.9).shouldBeApplied(tenPercent, 1000)) + assert(RetainCompletenessRule(0.0).shouldBeApplied(tenPercent, 1000)) + assert(RetainCompletenessRule(0.0).shouldBeApplied(incomplete, 1000)) assert(RetainCompletenessRule().shouldBeApplied(incomplete, 1000)) } @@ -183,6 +188,26 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext assert(metricResult.value.isSuccess) } + + "return evaluable constraint candidates with custom min/max completeness" in + withSparkSession { session => + + val dfWithColumnCandidate = getDfFull(session) + + val fakeColumnProfile = getFakeColumnProfileWithNameAndCompleteness("att1", 0.5) + + val check = Check(CheckLevel.Warning, "some") + .addConstraint(RetainCompletenessRule(0.4, 0.6).candidate(fakeColumnProfile, 100).constraint) + + val verificationResult = VerificationSuite() + .onData(dfWithColumnCandidate) + .addCheck(check) + .run() + + val metricResult = verificationResult.metrics.head._2 + + assert(metricResult.value.isSuccess) + } } "UniqueIfApproximatelyUniqueRule" should { From ac337eae95ea830938d1dd8a8870dddf16205019 Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Fri, 19 Apr 2024 11:12:50 +1000 Subject: [PATCH 02/12] Add doc string --- .../deequ/suggestions/rules/RetainCompletenessRule.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala index 
f632b7d47..df2f49308 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala @@ -26,8 +26,11 @@ import scala.math.BigDecimal.RoundingMode /** * If a column is incomplete in the sample, we model its completeness as a binomial variable, * estimate a confidence interval and use this to define a lower bound for the completeness + * + * @param minCompleteness : minimum completeness threshold to determine if rule should be applied + * @param maxCompleteness : maximum completeness threshold to determine if rule should be applied */ -case class RetainCompletenessRule(minCompleteness: Double = 0.2, maxCompleteness: Double = 1.0, sensitivity: Double = 1.96) extends ConstraintRule[ColumnProfile] { +case class RetainCompletenessRule(minCompleteness: Double = 0.2, maxCompleteness: Double = 1.0) extends ConstraintRule[ColumnProfile] { override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { profile.completeness > minCompleteness && profile.completeness < maxCompleteness } From db9b7646b58260495e4c6e40ad605b8ba9c78f93 Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Wed, 1 May 2024 11:19:08 +1000 Subject: [PATCH 03/12] Add default completeness const --- .../suggestions/rules/RetainCompletenessRule.scala | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala index df2f49308..9f995a112 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala @@ -20,6 +20,7 @@ import com.amazon.deequ.constraints.Constraint.completenessConstraint import com.amazon.deequ.profiles.ColumnProfile import com.amazon.deequ.suggestions.CommonConstraintSuggestion 
import com.amazon.deequ.suggestions.ConstraintSuggestion +import com.amazon.deequ.suggestions.rules.RetainCompletenessRule._ import scala.math.BigDecimal.RoundingMode @@ -30,7 +31,10 @@ import scala.math.BigDecimal.RoundingMode * @param minCompleteness : minimum completeness threshold to determine if rule should be applied * @param maxCompleteness : maximum completeness threshold to determine if rule should be applied */ -case class RetainCompletenessRule(minCompleteness: Double = 0.2, maxCompleteness: Double = 1.0) extends ConstraintRule[ColumnProfile] { +case class RetainCompletenessRule( + minCompleteness: Double = defaultMinCompleteness, + maxCompleteness: Double = defaultMaxCompleteness +) extends ConstraintRule[ColumnProfile] { override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { profile.completeness > minCompleteness && profile.completeness < maxCompleteness } @@ -67,3 +71,8 @@ case class RetainCompletenessRule(minCompleteness: Double = 0.2, maxCompleteness "we model its completeness as a binomial variable, estimate a confidence interval " + "and use this to define a lower bound for the completeness" } + +object RetainCompletenessRule { + private val defaultMinCompleteness: Double = 0.2 + private val defaultMaxCompleteness: Double = 1.0 +} From 91b1728fe488f14a6823caf9dbd94a7bf3f38659 Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Tue, 7 May 2024 08:44:26 +1000 Subject: [PATCH 04/12] Add ConfidenceIntervalStrategy --- .../FractionalCategoricalRangeRule.scala | 16 +++++---- .../rules/RetainCompletenessRule.scala | 16 +++------ .../interval/ConfidenceIntervalStrategy.scala | 34 +++++++++++++++++++ .../rules/interval/WaldIntervalStrategy.scala | 23 +++++++++++++ .../WilsonScoreIntervalStrategy.scala | 27 +++++++++++++++ .../rules/interval/IntervalStrategyTest.scala | 32 +++++++++++++++++ 6 files changed, 130 insertions(+), 18 deletions(-) create mode 100644 
src/main/scala/com/amazon/deequ/suggestions/rules/interval/ConfidenceIntervalStrategy.scala create mode 100644 src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala create mode 100644 src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala create mode 100644 src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala index 55e410f33..4970a4d0f 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala @@ -23,16 +23,17 @@ import com.amazon.deequ.metrics.DistributionValue import com.amazon.deequ.profiles.ColumnProfile import com.amazon.deequ.suggestions.ConstraintSuggestion import com.amazon.deequ.suggestions.ConstraintSuggestionWithValue +import com.amazon.deequ.suggestions.rules.FractionalCategoricalRangeRule.defaultIntervalStrategy +import com.amazon.deequ.suggestions.rules.interval.{ConfidenceIntervalStrategy, WilsonScoreIntervalStrategy} import org.apache.commons.lang3.StringEscapeUtils -import scala.math.BigDecimal.RoundingMode - /** If we see a categorical range for most values in a column, we suggest an IS IN (...) 
* constraint that should hold for most values */ case class FractionalCategoricalRangeRule( targetDataCoverageFraction: Double = 0.9, categorySorter: Array[(String, DistributionValue)] => Array[(String, DistributionValue)] = - categories => categories.sortBy({ case (_, value) => value.absolute }).reverse + categories => categories.sortBy({ case (_, value) => value.absolute }).reverse, + intervalStrategy: ConfidenceIntervalStrategy = defaultIntervalStrategy ) extends ConstraintRule[ColumnProfile] { override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { @@ -79,11 +80,8 @@ case class FractionalCategoricalRangeRule( val p = ratioSums val n = numRecords - val z = 1.96 - // TODO this needs to be more robust for p's close to 0 or 1 - val targetCompliance = BigDecimal(p - z * math.sqrt(p * (1 - p) / n)) - .setScale(2, RoundingMode.DOWN).toDouble + val targetCompliance = intervalStrategy.calculateTargetConfidenceInterval(p, n).lowerBound val description = s"'${profile.column}' has value range $categoriesSql for at least " + s"${targetCompliance * 100}% of values" @@ -128,3 +126,7 @@ case class FractionalCategoricalRangeRule( override val ruleDescription: String = "If we see a categorical range for most values " + "in a column, we suggest an IS IN (...) 
constraint that should hold for most values" } + +object FractionalCategoricalRangeRule { + private val defaultIntervalStrategy: ConfidenceIntervalStrategy = WilsonScoreIntervalStrategy() +} diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala index 9f995a112..35a287b6f 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala @@ -21,8 +21,7 @@ import com.amazon.deequ.profiles.ColumnProfile import com.amazon.deequ.suggestions.CommonConstraintSuggestion import com.amazon.deequ.suggestions.ConstraintSuggestion import com.amazon.deequ.suggestions.rules.RetainCompletenessRule._ - -import scala.math.BigDecimal.RoundingMode +import com.amazon.deequ.suggestions.rules.interval.{ConfidenceIntervalStrategy, WilsonScoreIntervalStrategy} /** * If a column is incomplete in the sample, we model its completeness as a binomial variable, @@ -33,21 +32,15 @@ import scala.math.BigDecimal.RoundingMode */ case class RetainCompletenessRule( minCompleteness: Double = defaultMinCompleteness, - maxCompleteness: Double = defaultMaxCompleteness + maxCompleteness: Double = defaultMaxCompleteness, + intervalStrategy: ConfidenceIntervalStrategy = defaultIntervalStrategy ) extends ConstraintRule[ColumnProfile] { override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { profile.completeness > minCompleteness && profile.completeness < maxCompleteness } override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = { - - val p = profile.completeness - val n = numRecords - val z = 1.96 - - // TODO this needs to be more robust for p's close to 0 or 1 - val targetCompleteness = BigDecimal(p - z * math.sqrt(p * (1 - p) / n)) - .setScale(2, RoundingMode.DOWN).toDouble + val targetCompleteness = 
intervalStrategy.calculateTargetConfidenceInterval(profile.completeness, numRecords).lowerBound val constraint = completenessConstraint(profile.column, _ >= targetCompleteness) @@ -75,4 +68,5 @@ case class RetainCompletenessRule( object RetainCompletenessRule { private val defaultMinCompleteness: Double = 0.2 private val defaultMaxCompleteness: Double = 1.0 + private val defaultIntervalStrategy: ConfidenceIntervalStrategy = WilsonScoreIntervalStrategy() } diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/ConfidenceIntervalStrategy.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/ConfidenceIntervalStrategy.scala new file mode 100644 index 000000000..097bd9118 --- /dev/null +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/ConfidenceIntervalStrategy.scala @@ -0,0 +1,34 @@ +package com.amazon.deequ.suggestions.rules.interval + +import breeze.stats.distributions.{Gaussian, Rand} +import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{ConfidenceInterval, defaultConfidence} + +/** + * Strategy for calculate confidence interval + * */ +trait ConfidenceIntervalStrategy { + + /** + * Generated confidence interval interval + * @param pHat sample of the population that share a trait + * @param numRecords overall number of records + * @param confidence confidence level of method used to estimate the interval. 
+ * @return + */ + def calculateTargetConfidenceInterval(pHat: Double, numRecords: Long, confidence: Double = defaultConfidence): ConfidenceInterval + + def validateInput(pHat: Double, confidence: Double): Unit = { + require(0.0 <= pHat && pHat <= 1.0, "pHat must be between 0.0 and 1.0") + require(0.0 <= confidence && confidence <= 1.0, "confidence must be between 0.0 and 1.0") + } + + def calculateZScore(confidence: Double): Double = Gaussian(0, 1)(Rand).inverseCdf(1 - ((1.0 - confidence)/ 2.0)) +} + +object ConfidenceIntervalStrategy { + val defaultConfidence = 0.95 + + case class ConfidenceInterval(lowerBound: Double, upperBound: Double) +} + + diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala new file mode 100644 index 000000000..6e8d1d066 --- /dev/null +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala @@ -0,0 +1,23 @@ +package com.amazon.deequ.suggestions.rules.interval + +import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{ConfidenceInterval, defaultConfidence} + +import scala.math.BigDecimal.RoundingMode + +/** + * Implements the Wald Interval method for creating a binomial proportion confidence interval. 
+ * + * @see + * Normal approximation interval (Wikipedia) + */ +case class WaldIntervalStrategy() extends ConfidenceIntervalStrategy { + def calculateTargetConfidenceInterval(pHat: Double, numRecords: Long, confidence: Double = defaultConfidence): ConfidenceInterval = { + validateInput(pHat, confidence) + val successRatio = BigDecimal(pHat) + val marginOfError = BigDecimal(calculateZScore(confidence) * math.sqrt(pHat * (1 - pHat) / numRecords)) + val lowerBound = (successRatio - marginOfError).setScale(2, RoundingMode.DOWN).toDouble + val upperBound = (successRatio + marginOfError).setScale(2, RoundingMode.UP).toDouble + ConfidenceInterval(lowerBound, upperBound) + } +} diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala new file mode 100644 index 000000000..e76b8a0e0 --- /dev/null +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala @@ -0,0 +1,27 @@ +package com.amazon.deequ.suggestions.rules.interval + +import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{ConfidenceInterval, defaultConfidence} + +import scala.math.BigDecimal.RoundingMode + +/** + * Using Wilson score method for creating a binomial proportion confidence interval. 
+ * + * @see + * Wilson score interval (Wikipedia) + */ +case class WilsonScoreIntervalStrategy() extends ConfidenceIntervalStrategy { + + def calculateTargetConfidenceInterval(pHat: Double, numRecords: Long, confidence: Double = defaultConfidence): ConfidenceInterval = { + validateInput(pHat, confidence) + val zScore = calculateZScore(confidence) + val zSquareOverN = math.pow(zScore, 2) / numRecords + val factor = 1.0 / (1 + zSquareOverN) + val adjustedSuccessRatio = pHat + zSquareOverN/2 + val marginOfError = zScore * math.sqrt(pHat * (1 - pHat)/numRecords + zSquareOverN/(4 * numRecords)) + val lowerBound = BigDecimal(factor * (adjustedSuccessRatio - marginOfError)).setScale(2, RoundingMode.DOWN).toDouble + val upperBound = BigDecimal(factor * (adjustedSuccessRatio + marginOfError)).setScale(2, RoundingMode.UP).toDouble + ConfidenceInterval(lowerBound, upperBound) + } +} diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala new file mode 100644 index 000000000..708fd285f --- /dev/null +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala @@ -0,0 +1,32 @@ +package com.amazon.deequ.suggestions.rules.interval + +import com.amazon.deequ.SparkContextSpec +import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.ConfidenceInterval +import com.amazon.deequ.utils.FixtureSupport +import org.scalamock.scalatest.MockFactory +import org.scalatest.wordspec.AnyWordSpec + +class IntervalStrategyTest extends AnyWordSpec with FixtureSupport with SparkContextSpec + with MockFactory { + "WaldIntervalStrategy" should { + "be calculated correctly" in { + assert(WaldIntervalStrategy().calculateTargetConfidenceInterval(1.0, 20L) == ConfidenceInterval(1.0, 1.0)) + assert(WaldIntervalStrategy().calculateTargetConfidenceInterval(0.5, 100L) == ConfidenceInterval(0.4, 0.6)) + 
assert(WaldIntervalStrategy().calculateTargetConfidenceInterval(0.4, 100L) == ConfidenceInterval(0.3, 0.5)) + assert(WaldIntervalStrategy().calculateTargetConfidenceInterval(0.6, 100L) == ConfidenceInterval(0.5, 0.7)) + assert(WaldIntervalStrategy().calculateTargetConfidenceInterval(0.90, 100L) == ConfidenceInterval(0.84, 0.96)) + assert(WaldIntervalStrategy().calculateTargetConfidenceInterval(1.0, 100L) == ConfidenceInterval(1.0, 1.0)) + } + } + + "WilsonIntervalStrategy" should { + "be calculated correctly" in { + assert(WilsonScoreIntervalStrategy().calculateTargetConfidenceInterval(1.0, 20L) == ConfidenceInterval(0.83, 1.0)) + assert(WilsonScoreIntervalStrategy().calculateTargetConfidenceInterval(0.5, 100L) == ConfidenceInterval(0.4, 0.6)) + assert(WilsonScoreIntervalStrategy().calculateTargetConfidenceInterval(0.4, 100L) == ConfidenceInterval(0.3, 0.5)) + assert(WilsonScoreIntervalStrategy().calculateTargetConfidenceInterval(0.6, 100L) == ConfidenceInterval(0.5, 0.7)) + assert(WilsonScoreIntervalStrategy().calculateTargetConfidenceInterval(0.90, 100L) == ConfidenceInterval(0.82, 0.95)) + assert(WilsonScoreIntervalStrategy().calculateTargetConfidenceInterval(1.0, 100L) == ConfidenceInterval(0.96, 1.0)) + } + } +} From 8cbffcdb76bd15c94434c0c2faffd4212a379658 Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Tue, 7 May 2024 23:18:15 +1000 Subject: [PATCH 05/12] Add Separate Wilson and Wald Interval Test --- .../rules/ConstraintRulesTest.scala | 92 +++++++++++-------- .../rules/interval/IntervalStrategyTest.scala | 41 +++++---- 2 files changed, 80 insertions(+), 53 deletions(-) diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index 701a5d983..328691fdc 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -22,10 +22,13 @@ import 
com.amazon.deequ.checks.{Check, CheckLevel} import com.amazon.deequ.constraints.ConstrainableDataTypes import com.amazon.deequ.metrics.{Distribution, DistributionValue} import com.amazon.deequ.profiles._ +import com.amazon.deequ.suggestions.rules.interval.{WaldIntervalStrategy, WilsonScoreIntervalStrategy} import com.amazon.deequ.utils.FixtureSupport import com.amazon.deequ.{SparkContextSpec, VerificationSuite} import org.scalamock.scalatest.MockFactory +import org.scalatest.Inspectors.forAll import org.scalatest.WordSpec +import org.scalatest.prop.Tables.Table class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContextSpec with MockFactory{ @@ -132,6 +135,7 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val complete = StandardColumnProfile("col1", 1.0, 100, String, false, Map.empty, None) val tenPercent = StandardColumnProfile("col1", 0.1, 100, String, false, Map.empty, None) val incomplete = StandardColumnProfile("col1", .25, 100, String, false, Map.empty, None) + val waldIntervalStrategy = WaldIntervalStrategy() assert(!RetainCompletenessRule().shouldBeApplied(complete, 1000)) assert(!RetainCompletenessRule(0.05, 0.9).shouldBeApplied(complete, 1000)) @@ -139,74 +143,90 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext assert(RetainCompletenessRule(0.0).shouldBeApplied(tenPercent, 1000)) assert(RetainCompletenessRule(0.0).shouldBeApplied(incomplete, 1000)) assert(RetainCompletenessRule().shouldBeApplied(incomplete, 1000)) + assert(!RetainCompletenessRule(intervalStrategy = waldIntervalStrategy).shouldBeApplied(complete, 1000)) + assert(!RetainCompletenessRule(0.05, 0.9, waldIntervalStrategy).shouldBeApplied(complete, 1000)) + assert(RetainCompletenessRule(0.05, 0.9, waldIntervalStrategy).shouldBeApplied(tenPercent, 1000)) } "return evaluable constraint candidates" in withSparkSession { session => + val table = Table(("strategy", "result"), (WaldIntervalStrategy(), true), 
(WilsonScoreIntervalStrategy(), true)) + forAll(table) { case (strategy, result) => + val dfWithColumnCandidate = getDfFull(session) - val dfWithColumnCandidate = getDfFull(session) + val fakeColumnProfile = getFakeColumnProfileWithNameAndCompleteness("att1", 0.5) - val fakeColumnProfile = getFakeColumnProfileWithNameAndCompleteness("att1", 0.5) + val check = Check(CheckLevel.Warning, "some") + .addConstraint(RetainCompletenessRule(intervalStrategy = strategy).candidate(fakeColumnProfile, 100).constraint) - val check = Check(CheckLevel.Warning, "some") - .addConstraint(RetainCompletenessRule().candidate(fakeColumnProfile, 100).constraint) + val verificationResult = VerificationSuite() + .onData(dfWithColumnCandidate) + .addCheck(check) + .run() - val verificationResult = VerificationSuite() - .onData(dfWithColumnCandidate) - .addCheck(check) - .run() + val metricResult = verificationResult.metrics.head._2 - val metricResult = verificationResult.metrics.head._2 + assert(metricResult.value.isSuccess == result) + } - assert(metricResult.value.isSuccess) } "return working code to add constraint to check" in withSparkSession { session => + val table = Table( + ("strategy", "colCompleteness", "targetCompleteness", "result"), + (WaldIntervalStrategy(), 0.5, 0.4, true), + (WilsonScoreIntervalStrategy(), 0.4, 0.3, true) + ) + forAll(table) { case (strategy, colCompleteness, targetCompleteness, result) => - val dfWithColumnCandidate = getDfFull(session) + val dfWithColumnCandidate = getDfFull(session) - val fakeColumnProfile = getFakeColumnProfileWithNameAndCompleteness("att1", 0.5) + val fakeColumnProfile = getFakeColumnProfileWithNameAndCompleteness("att1", colCompleteness) - val codeForConstraint = RetainCompletenessRule().candidate(fakeColumnProfile, 100) - .codeForConstraint + val codeForConstraint = RetainCompletenessRule(intervalStrategy = strategy).candidate(fakeColumnProfile, 100) + .codeForConstraint - val expectedCodeForConstraint = """.hasCompleteness("att1", _ 
>= 0.4, - | Some("It should be above 0.4!"))""".stripMargin.replaceAll("\n", "") + val expectedCodeForConstraint = s""".hasCompleteness("att1", _ >= $targetCompleteness, + | Some("It should be above $targetCompleteness!"))""".stripMargin.replaceAll("\n", "") - assert(expectedCodeForConstraint == codeForConstraint) + assert(expectedCodeForConstraint == codeForConstraint) - val check = Check(CheckLevel.Warning, "some") - .hasCompleteness("att1", _ >= 0.4, Some("It should be above 0.4!")) + val check = Check(CheckLevel.Warning, "some") + .hasCompleteness("att1", _ >= targetCompleteness, Some(s"It should be above $targetCompleteness")) - val verificationResult = VerificationSuite() - .onData(dfWithColumnCandidate) - .addCheck(check) - .run() + val verificationResult = VerificationSuite() + .onData(dfWithColumnCandidate) + .addCheck(check) + .run() - val metricResult = verificationResult.metrics.head._2 + val metricResult = verificationResult.metrics.head._2 + + assert(metricResult.value.isSuccess == result) + } - assert(metricResult.value.isSuccess) } "return evaluable constraint candidates with custom min/max completeness" in withSparkSession { session => + val table = Table(("strategy", "result"), (WaldIntervalStrategy(), true), (WilsonScoreIntervalStrategy(), true)) + forAll(table) { case (strategy, result) => + val dfWithColumnCandidate = getDfFull(session) - val dfWithColumnCandidate = getDfFull(session) - - val fakeColumnProfile = getFakeColumnProfileWithNameAndCompleteness("att1", 0.5) + val fakeColumnProfile = getFakeColumnProfileWithNameAndCompleteness("att1", 0.5) - val check = Check(CheckLevel.Warning, "some") - .addConstraint(RetainCompletenessRule(0.4, 0.6).candidate(fakeColumnProfile, 100).constraint) + val check = Check(CheckLevel.Warning, "some") + .addConstraint(RetainCompletenessRule(0.4, 0.6, strategy).candidate(fakeColumnProfile, 100).constraint) - val verificationResult = VerificationSuite() - .onData(dfWithColumnCandidate) - .addCheck(check) - 
.run() + val verificationResult = VerificationSuite() + .onData(dfWithColumnCandidate) + .addCheck(check) + .run() - val metricResult = verificationResult.metrics.head._2 + val metricResult = verificationResult.metrics.head._2 - assert(metricResult.value.isSuccess) + assert(metricResult.value.isSuccess == result) + } } } diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala index 708fd285f..2d1021795 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala @@ -4,29 +4,36 @@ import com.amazon.deequ.SparkContextSpec import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.ConfidenceInterval import com.amazon.deequ.utils.FixtureSupport import org.scalamock.scalatest.MockFactory +import org.scalatest.Inspectors.forAll +import org.scalatest.prop.Tables.Table import org.scalatest.wordspec.AnyWordSpec class IntervalStrategyTest extends AnyWordSpec with FixtureSupport with SparkContextSpec with MockFactory { - "WaldIntervalStrategy" should { + "ConfidenceIntervalStrategy" should { "be calculated correctly" in { - assert(WaldIntervalStrategy().calculateTargetConfidenceInterval(1.0, 20L) == ConfidenceInterval(1.0, 1.0)) - assert(WaldIntervalStrategy().calculateTargetConfidenceInterval(0.5, 100L) == ConfidenceInterval(0.4, 0.6)) - assert(WaldIntervalStrategy().calculateTargetConfidenceInterval(0.4, 100L) == ConfidenceInterval(0.3, 0.5)) - assert(WaldIntervalStrategy().calculateTargetConfidenceInterval(0.6, 100L) == ConfidenceInterval(0.5, 0.7)) - assert(WaldIntervalStrategy().calculateTargetConfidenceInterval(0.90, 100L) == ConfidenceInterval(0.84, 0.96)) - assert(WaldIntervalStrategy().calculateTargetConfidenceInterval(1.0, 100L) == ConfidenceInterval(1.0, 1.0)) - } - } + val waldStrategy = 
WaldIntervalStrategy() + val wilsonStrategy = WilsonScoreIntervalStrategy() + val table = Table( + ("strategy", "pHat", "numRecord", "lowerBound", "upperBound"), + (waldStrategy, 1.0, 20L, 1.0, 1.0), + (waldStrategy, 0.5, 100L, 0.4, 0.6), + (waldStrategy, 0.4, 100L, 0.3, 0.5), + (waldStrategy, 0.6, 100L, 0.5, 0.7), + (waldStrategy, 0.9, 100L, 0.84, 0.96), + (waldStrategy, 1.0, 100L, 1.0, 1.0), - "WilsonIntervalStrategy" should { - "be calculated correctly" in { - assert(WilsonScoreIntervalStrategy().calculateTargetConfidenceInterval(1.0, 20L) == ConfidenceInterval(0.83, 1.0)) - assert(WilsonScoreIntervalStrategy().calculateTargetConfidenceInterval(0.5, 100L) == ConfidenceInterval(0.4, 0.6)) - assert(WilsonScoreIntervalStrategy().calculateTargetConfidenceInterval(0.4, 100L) == ConfidenceInterval(0.3, 0.5)) - assert(WilsonScoreIntervalStrategy().calculateTargetConfidenceInterval(0.6, 100L) == ConfidenceInterval(0.5, 0.7)) - assert(WilsonScoreIntervalStrategy().calculateTargetConfidenceInterval(0.90, 100L) == ConfidenceInterval(0.82, 0.95)) - assert(WilsonScoreIntervalStrategy().calculateTargetConfidenceInterval(1.0, 100L) == ConfidenceInterval(0.96, 1.0)) + (wilsonStrategy, 0.01, 20L, 0.00, 0.18), + (wilsonStrategy, 1.0, 20L, 0.83, 1.0), + (wilsonStrategy, 0.5, 100L, 0.4, 0.6), + (wilsonStrategy, 0.4, 100L, 0.3, 0.5), + (wilsonStrategy, 0.6, 100L, 0.5, 0.7), + (wilsonStrategy, 0.9, 100L, 0.82, 0.95), + (wilsonStrategy, 1.0, 100L, 0.96, 1.0), + ) + forAll(table) { case (strategy, pHat, numRecords, lowerBound, upperBound) => + assert(strategy.calculateTargetConfidenceInterval(pHat, numRecords) == ConfidenceInterval(lowerBound, upperBound)) + } } } } From 3a9916fd93847a81ef16b59ce5077c70575f68a1 Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Thu, 16 May 2024 17:38:24 +1000 Subject: [PATCH 06/12] Add License information, Fix formatting --- .../rules/RetainCompletenessRule.scala | 5 ++++- .../interval/ConfidenceIntervalStrategy.scala | 22 ++++++++++++++++++- 
.../rules/interval/WaldIntervalStrategy.scala | 22 ++++++++++++++++++- .../WilsonScoreIntervalStrategy.scala | 21 +++++++++++++++++- .../rules/ConstraintRulesTest.scala | 4 +++- 5 files changed, 69 insertions(+), 5 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala index 35a287b6f..7ac015fb2 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala @@ -40,7 +40,10 @@ case class RetainCompletenessRule( } override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = { - val targetCompleteness = intervalStrategy.calculateTargetConfidenceInterval(profile.completeness, numRecords).lowerBound + val targetCompleteness = intervalStrategy.calculateTargetConfidenceInterval( + profile.completeness, + numRecords + ).lowerBound val constraint = completenessConstraint(profile.column, _ >= targetCompleteness) diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/ConfidenceIntervalStrategy.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/ConfidenceIntervalStrategy.scala index 097bd9118..e3fbd8622 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/ConfidenceIntervalStrategy.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/ConfidenceIntervalStrategy.scala @@ -1,3 +1,19 @@ +/** + * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not + * use this file except in compliance with the License. A copy of the License + * is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. 
This file is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + * + */ + package com.amazon.deequ.suggestions.rules.interval import breeze.stats.distributions.{Gaussian, Rand} @@ -15,7 +31,11 @@ trait ConfidenceIntervalStrategy { * @param confidence confidence level of method used to estimate the interval. * @return */ - def calculateTargetConfidenceInterval(pHat: Double, numRecords: Long, confidence: Double = defaultConfidence): ConfidenceInterval + def calculateTargetConfidenceInterval( + pHat: Double, + numRecords: Long, + confidence: Double = defaultConfidence + ): ConfidenceInterval def validateInput(pHat: Double, confidence: Double): Unit = { require(0.0 <= pHat && pHat <= 1.0, "pHat must be between 0.0 and 1.0") diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala index 6e8d1d066..ecfd6fb77 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala @@ -1,3 +1,19 @@ +/** + * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not + * use this file except in compliance with the License. A copy of the License + * is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. This file is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ * + */ + package com.amazon.deequ.suggestions.rules.interval import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{ConfidenceInterval, defaultConfidence} @@ -12,7 +28,11 @@ import scala.math.BigDecimal.RoundingMode * Normal approximation interval (Wikipedia) */ case class WaldIntervalStrategy() extends ConfidenceIntervalStrategy { - def calculateTargetConfidenceInterval(pHat: Double, numRecords: Long, confidence: Double = defaultConfidence): ConfidenceInterval = { + def calculateTargetConfidenceInterval( + pHat: Double, + numRecords: Long, + confidence: Double = defaultConfidence + ): ConfidenceInterval = { validateInput(pHat, confidence) val successRatio = BigDecimal(pHat) val marginOfError = BigDecimal(calculateZScore(confidence) * math.sqrt(pHat * (1 - pHat) / numRecords)) diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala index e76b8a0e0..46afbfdb1 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala @@ -1,3 +1,19 @@ +/** + * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not + * use this file except in compliance with the License. A copy of the License + * is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. This file is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ * + */ + package com.amazon.deequ.suggestions.rules.interval import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{ConfidenceInterval, defaultConfidence} @@ -13,7 +29,10 @@ import scala.math.BigDecimal.RoundingMode */ case class WilsonScoreIntervalStrategy() extends ConfidenceIntervalStrategy { - def calculateTargetConfidenceInterval(pHat: Double, numRecords: Long, confidence: Double = defaultConfidence): ConfidenceInterval = { + def calculateTargetConfidenceInterval( + pHat: Double, numRecords: Long, + confidence: Double = defaultConfidence + ): ConfidenceInterval = { validateInput(pHat, confidence) val zScore = calculateZScore(confidence) val zSquareOverN = math.pow(zScore, 2) / numRecords diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index 328691fdc..7a1366e6a 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -157,7 +157,9 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val fakeColumnProfile = getFakeColumnProfileWithNameAndCompleteness("att1", 0.5) val check = Check(CheckLevel.Warning, "some") - .addConstraint(RetainCompletenessRule(intervalStrategy = strategy).candidate(fakeColumnProfile, 100).constraint) + .addConstraint( + RetainCompletenessRule(intervalStrategy = strategy).candidate(fakeColumnProfile, 100).constraint + ) val verificationResult = VerificationSuite() .onData(dfWithColumnCandidate) From d27cb9bcd9c87147aaa522909ed254fada97ad34 Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Thu, 16 May 2024 17:40:10 +1000 Subject: [PATCH 07/12] Add License information --- .../rules/interval/IntervalStrategyTest.scala | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git 
a/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala index 2d1021795..f0d8d296b 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala @@ -1,3 +1,19 @@ +/** + * Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not + * use this file except in compliance with the License. A copy of the License + * is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. This file is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + * + */ + package com.amazon.deequ.suggestions.rules.interval import com.amazon.deequ.SparkContextSpec From 3f849f83a58beb2abe39f40051b7ab32447b8bc1 Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Mon, 20 May 2024 13:35:30 +1000 Subject: [PATCH 08/12] formatting fix --- .../suggestions/rules/interval/IntervalStrategyTest.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala index f0d8d296b..7759477ec 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala @@ -38,7 +38,6 @@ class IntervalStrategyTest extends AnyWordSpec with FixtureSupport with SparkCon (waldStrategy, 0.6, 100L, 0.5, 0.7), (waldStrategy, 0.9, 100L, 0.84, 0.96), (waldStrategy, 1.0, 100L, 1.0, 
1.0), - (wilsonStrategy, 0.01, 20L, 0.00, 0.18), (wilsonStrategy, 1.0, 20L, 0.83, 1.0), (wilsonStrategy, 0.5, 100L, 0.4, 0.6), @@ -48,7 +47,8 @@ class IntervalStrategyTest extends AnyWordSpec with FixtureSupport with SparkCon (wilsonStrategy, 1.0, 100L, 0.96, 1.0), ) forAll(table) { case (strategy, pHat, numRecords, lowerBound, upperBound) => - assert(strategy.calculateTargetConfidenceInterval(pHat, numRecords) == ConfidenceInterval(lowerBound, upperBound)) + val actualInterval = strategy.calculateTargetConfidenceInterval(pHat, numRecords) + assert(actualInterval == ConfidenceInterval(lowerBound, upperBound)) } } } From 71d6e3f1a87d965218b9d072ffa79c72af9355c7 Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Mon, 20 May 2024 21:56:20 +1000 Subject: [PATCH 09/12] Update documentation --- .../examples/ConstraintSuggestionExample.scala | 6 ++++++ .../deequ/examples/constraint_suggestion_example.md | 13 +++++++++++++ .../rules/interval/WaldIntervalStrategy.scala | 7 +++++-- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/examples/ConstraintSuggestionExample.scala b/src/main/scala/com/amazon/deequ/examples/ConstraintSuggestionExample.scala index 8aa0fb6c5..fc8f458bf 100644 --- a/src/main/scala/com/amazon/deequ/examples/ConstraintSuggestionExample.scala +++ b/src/main/scala/com/amazon/deequ/examples/ConstraintSuggestionExample.scala @@ -17,6 +17,8 @@ package com.amazon.deequ.examples import com.amazon.deequ.examples.ExampleUtils.withSpark +import com.amazon.deequ.suggestions.rules.RetainCompletenessRule +import com.amazon.deequ.suggestions.rules.interval.WilsonScoreIntervalStrategy import com.amazon.deequ.suggestions.{ConstraintSuggestionRunner, Rules} private[examples] object ConstraintSuggestionExample extends App { @@ -51,6 +53,10 @@ private[examples] object ConstraintSuggestionExample extends App { val suggestionResult = ConstraintSuggestionRunner() .onData(data) .addConstraintRules(Rules.EXTENDED) + // We can also add our 
own constraint and customize constraint parameters + .addConstraintRule( + RetainCompletenessRule(intervalStrategy = WilsonScoreIntervalStrategy()) + ) .run() // We can now investigate the constraints that deequ suggested. We get a textual description diff --git a/src/main/scala/com/amazon/deequ/examples/constraint_suggestion_example.md b/src/main/scala/com/amazon/deequ/examples/constraint_suggestion_example.md index df159a9c9..472f63c7d 100644 --- a/src/main/scala/com/amazon/deequ/examples/constraint_suggestion_example.md +++ b/src/main/scala/com/amazon/deequ/examples/constraint_suggestion_example.md @@ -43,6 +43,17 @@ val suggestionResult = ConstraintSuggestionRunner() .run() ``` +Alternatively, we also support customizing and adding individual constraint rule using `addConstraintRule()` +```scala +val suggestionResult = ConstraintSuggestionRunner() + .onData(data) + + .addConstraintRule( + RetainCompletenessRule(intervalStrategy = WilsonScoreIntervalStrategy()) + ) + .run() +``` + We can now investigate the constraints that deequ suggested. We get a textual description and the corresponding scala code for each suggested constraint. Note that the constraint suggestion is based on heuristic rules and assumes that the data it is shown is 'static' and correct, which might often not be the case in the real world. Therefore the suggestions should always be manually reviewed before being applied in real deployments. ```scala suggestionResult.constraintSuggestions.foreach { case (column, suggestions) => @@ -92,3 +103,5 @@ The corresponding scala code is .isContainedIn("status", Array("DELAYED", "UNKNO Currently, we leave it up to the user to decide whether they want to apply the suggested constraints or not, and provide the corresponding Scala code for convenience. For larger datasets, it makes sense to evaluate the suggested constraints on some held-out portion of the data to see whether they hold or not. 
You can test this by adding an invocation of `.useTrainTestSplitWithTestsetRatio(0.1)` to the `ConstraintSuggestionRunner`. With this configuration, it would compute constraint suggestions on 90% of the data and evaluate the suggested constraints on the remaining 10%. Finally, we would also like to note that the constraint suggestion code provides access to the underlying [column profiles](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/data_profiling_example.md) that it computed via `suggestionResult.columnProfiles`. + +An [executable and extended version of this example](https://github.com/awslabs/deequ/blob/master/src/main/scala/com/amazon/deequ/examples/ConstraintSuggestionExample.scala) is part of our code base. diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala index ecfd6fb77..15574bf17 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala @@ -21,12 +21,15 @@ import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{C import scala.math.BigDecimal.RoundingMode /** - * Implements the Wald Interval method for creating a binomial proportion confidence interval. - * + * Implements the Wald Interval method for creating a binomial proportion confidence interval. Provided for backwards + * compatibility. Using [[WaldIntervalStrategy]] for calculating confidence interval can be problematic when dealing + * with small sample sizes or proportions close to 0 or 1. 
It also has poorer coverage and might produce confidence + limits outside the range of [0,1] * @see * Normal approximation interval (Wikipedia) */ +@deprecated("WilsonScoreIntervalStrategy is recommended for calculating confidence interval") case class WaldIntervalStrategy() extends ConfidenceIntervalStrategy { def calculateTargetConfidenceInterval( pHat: Double, From 387ab8115fd5ebc40de02696355a81725044d9a1 Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Wed, 22 May 2024 09:33:51 +1000 Subject: [PATCH 10/12] Make WaldInterval the default strategy for now --- .../deequ/suggestions/rules/RetainCompletenessRule.scala | 4 ++-- .../suggestions/rules/interval/IntervalStrategyTest.scala | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala index 7ac015fb2..c7a079cab 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala @@ -21,7 +21,7 @@ import com.amazon.deequ.profiles.ColumnProfile import com.amazon.deequ.suggestions.CommonConstraintSuggestion import com.amazon.deequ.suggestions.ConstraintSuggestion import com.amazon.deequ.suggestions.rules.RetainCompletenessRule._ -import com.amazon.deequ.suggestions.rules.interval.{ConfidenceIntervalStrategy, WilsonScoreIntervalStrategy} +import com.amazon.deequ.suggestions.rules.interval.{ConfidenceIntervalStrategy, WaldIntervalStrategy, WilsonScoreIntervalStrategy} /** * If a column is incomplete in the sample, we model its completeness as a binomial variable, @@ -71,5 +71,5 @@ case class RetainCompletenessRule( object RetainCompletenessRule { private val defaultMinCompleteness: Double = 0.2 private val defaultMaxCompleteness: Double = 1.0 - private val defaultIntervalStrategy: ConfidenceIntervalStrategy = WilsonScoreIntervalStrategy() 
+ private val defaultIntervalStrategy: ConfidenceIntervalStrategy = WaldIntervalStrategy() } diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala index 7759477ec..54e6cd1e1 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/interval/IntervalStrategyTest.scala @@ -26,10 +26,12 @@ import org.scalatest.wordspec.AnyWordSpec class IntervalStrategyTest extends AnyWordSpec with FixtureSupport with SparkContextSpec with MockFactory { + "ConfidenceIntervalStrategy" should { "be calculated correctly" in { val waldStrategy = WaldIntervalStrategy() val wilsonStrategy = WilsonScoreIntervalStrategy() + val table = Table( ("strategy", "pHat", "numRecord", "lowerBound", "upperBound"), (waldStrategy, 1.0, 20L, 1.0, 1.0), @@ -38,14 +40,16 @@ class IntervalStrategyTest extends AnyWordSpec with FixtureSupport with SparkCon (waldStrategy, 0.6, 100L, 0.5, 0.7), (waldStrategy, 0.9, 100L, 0.84, 0.96), (waldStrategy, 1.0, 100L, 1.0, 1.0), + (wilsonStrategy, 0.01, 20L, 0.00, 0.18), (wilsonStrategy, 1.0, 20L, 0.83, 1.0), (wilsonStrategy, 0.5, 100L, 0.4, 0.6), (wilsonStrategy, 0.4, 100L, 0.3, 0.5), (wilsonStrategy, 0.6, 100L, 0.5, 0.7), (wilsonStrategy, 0.9, 100L, 0.82, 0.95), - (wilsonStrategy, 1.0, 100L, 0.96, 1.0), + (wilsonStrategy, 1.0, 100L, 0.96, 1.0) ) + forAll(table) { case (strategy, pHat, numRecords, lowerBound, upperBound) => val actualInterval = strategy.calculateTargetConfidenceInterval(pHat, numRecords) assert(actualInterval == ConfidenceInterval(lowerBound, upperBound)) From 913c795c14243240e2e8f15bf78666917243a218 Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Thu, 23 May 2024 13:35:11 +1000 Subject: [PATCH 11/12] Formatting import to per line --- .../rules/FractionalCategoricalRangeRule.scala | 8 ++------ 
.../deequ/suggestions/rules/RetainCompletenessRule.scala | 4 ++-- .../rules/interval/ConfidenceIntervalStrategy.scala | 3 ++- .../suggestions/rules/interval/WaldIntervalStrategy.scala | 5 ++++- .../rules/interval/WilsonScoreIntervalStrategy.scala | 5 ++++- .../deequ/suggestions/rules/ConstraintRulesTest.scala | 5 ++++- 6 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala index 4970a4d0f..f9dd192e8 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala @@ -23,8 +23,8 @@ import com.amazon.deequ.metrics.DistributionValue import com.amazon.deequ.profiles.ColumnProfile import com.amazon.deequ.suggestions.ConstraintSuggestion import com.amazon.deequ.suggestions.ConstraintSuggestionWithValue -import com.amazon.deequ.suggestions.rules.FractionalCategoricalRangeRule.defaultIntervalStrategy -import com.amazon.deequ.suggestions.rules.interval.{ConfidenceIntervalStrategy, WilsonScoreIntervalStrategy} +import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.defaultIntervalStrategy +import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy import org.apache.commons.lang3.StringEscapeUtils /** If we see a categorical range for most values in a column, we suggest an IS IN (...) @@ -126,7 +126,3 @@ case class FractionalCategoricalRangeRule( override val ruleDescription: String = "If we see a categorical range for most values " + "in a column, we suggest an IS IN (...) 
constraint that should hold for most values" } - -object FractionalCategoricalRangeRule { - private val defaultIntervalStrategy: ConfidenceIntervalStrategy = WilsonScoreIntervalStrategy() -} diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala index c7a079cab..be5bd101f 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala @@ -21,7 +21,8 @@ import com.amazon.deequ.profiles.ColumnProfile import com.amazon.deequ.suggestions.CommonConstraintSuggestion import com.amazon.deequ.suggestions.ConstraintSuggestion import com.amazon.deequ.suggestions.rules.RetainCompletenessRule._ -import com.amazon.deequ.suggestions.rules.interval.{ConfidenceIntervalStrategy, WaldIntervalStrategy, WilsonScoreIntervalStrategy} +import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.defaultIntervalStrategy +import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy /** * If a column is incomplete in the sample, we model its completeness as a binomial variable, @@ -71,5 +72,4 @@ case class RetainCompletenessRule( object RetainCompletenessRule { private val defaultMinCompleteness: Double = 0.2 private val defaultMaxCompleteness: Double = 1.0 - private val defaultIntervalStrategy: ConfidenceIntervalStrategy = WaldIntervalStrategy() } diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/ConfidenceIntervalStrategy.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/ConfidenceIntervalStrategy.scala index e3fbd8622..0c12e03a5 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/ConfidenceIntervalStrategy.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/ConfidenceIntervalStrategy.scala @@ -17,7 +17,7 @@ package 
com.amazon.deequ.suggestions.rules.interval import breeze.stats.distributions.{Gaussian, Rand} -import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{ConfidenceInterval, defaultConfidence} +import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy._ /** * Strategy for calculate confidence interval @@ -47,6 +47,7 @@ trait ConfidenceIntervalStrategy { object ConfidenceIntervalStrategy { val defaultConfidence = 0.95 + val defaultIntervalStrategy: ConfidenceIntervalStrategy = WaldIntervalStrategy() case class ConfidenceInterval(lowerBound: Double, upperBound: Double) } diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala index 15574bf17..773aa0337 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala @@ -16,7 +16,10 @@ package com.amazon.deequ.suggestions.rules.interval -import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{ConfidenceInterval, defaultConfidence} +import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{ + ConfidenceInterval, + defaultConfidence +} import scala.math.BigDecimal.RoundingMode diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala index 46afbfdb1..79b66bd2b 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala @@ -16,7 +16,10 @@ package com.amazon.deequ.suggestions.rules.interval -import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{ConfidenceInterval, 
defaultConfidence} +import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{ + ConfidenceInterval, + defaultConfidence +} import scala.math.BigDecimal.RoundingMode diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index 7a1366e6a..8c20e193d 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -22,7 +22,10 @@ import com.amazon.deequ.checks.{Check, CheckLevel} import com.amazon.deequ.constraints.ConstrainableDataTypes import com.amazon.deequ.metrics.{Distribution, DistributionValue} import com.amazon.deequ.profiles._ -import com.amazon.deequ.suggestions.rules.interval.{WaldIntervalStrategy, WilsonScoreIntervalStrategy} +import com.amazon.deequ.suggestions.rules.interval.{ + WaldIntervalStrategy, + WilsonScoreIntervalStrategy +} import com.amazon.deequ.utils.FixtureSupport import com.amazon.deequ.{SparkContextSpec, VerificationSuite} import org.scalamock.scalatest.MockFactory From bfe2c78cb396f1ed44b33fe37a571b5f99eb04b6 Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Thu, 23 May 2024 18:39:01 +1000 Subject: [PATCH 12/12] Separate group import to per line import --- .../suggestions/rules/interval/WaldIntervalStrategy.scala | 6 ++---- .../rules/interval/WilsonScoreIntervalStrategy.scala | 6 ++---- .../deequ/suggestions/rules/ConstraintRulesTest.scala | 6 ++---- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala index 773aa0337..154d8ebfe 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WaldIntervalStrategy.scala @@ 
-16,10 +16,8 @@ package com.amazon.deequ.suggestions.rules.interval -import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{ - ConfidenceInterval, - defaultConfidence -} +import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.ConfidenceInterval +import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.defaultConfidence import scala.math.BigDecimal.RoundingMode diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala index 79b66bd2b..6e8371ea5 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/interval/WilsonScoreIntervalStrategy.scala @@ -16,10 +16,8 @@ package com.amazon.deequ.suggestions.rules.interval -import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{ - ConfidenceInterval, - defaultConfidence -} +import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.ConfidenceInterval +import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.defaultConfidence import scala.math.BigDecimal.RoundingMode diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index 8c20e193d..7b56e3938 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -22,10 +22,8 @@ import com.amazon.deequ.checks.{Check, CheckLevel} import com.amazon.deequ.constraints.ConstrainableDataTypes import com.amazon.deequ.metrics.{Distribution, DistributionValue} import com.amazon.deequ.profiles._ -import com.amazon.deequ.suggestions.rules.interval.{ - WaldIntervalStrategy, - 
WilsonScoreIntervalStrategy -} +import com.amazon.deequ.suggestions.rules.interval.WaldIntervalStrategy +import com.amazon.deequ.suggestions.rules.interval.WilsonScoreIntervalStrategy import com.amazon.deequ.utils.FixtureSupport import com.amazon.deequ.{SparkContextSpec, VerificationSuite} import org.scalamock.scalatest.MockFactory