From f043d19e13ae70a3dfde1deecddd69c5101ea155 Mon Sep 17 00:00:00 2001
From: Lance Bragstad
Date: Thu, 16 Feb 2023 16:13:07 -0600
Subject: [PATCH] OCPBUGS-3009: Prune stale CCRs before aggregating scan results

Previously, the compliance operator would leave ComplianceCheckResults
(CCRs) around and just overwrite them on subsequent scans. While the
most recent scan data was accurate, silently overwriting existing check
results gave the impression that some changes weren't taking effect.
For example, if you create a tailored profile, run a scan, exclude a
rule, and rerun the scan, it appears the change you just made never
took effect because the result for the rule you excluded still exists.

To avoid this, let's prune stale results when we aggregate new results.
---
 CHANGELOG.md                                 |   4 +
 cmd/manager/aggregator.go                    |  48 ++++++++
 config/rbac/remediation_aggregator_role.yaml |   2 +
 tests/e2e/framework/common.go                |  38 +++++++
 tests/e2e/parallel/main_test.go              | 113 +++++++++++++++++++
 tests/e2e/serial/main_test.go                |  21 ++--
 6 files changed, 217 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 36cd9230a..b82907d14 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -97,6 +97,10 @@ Versioning](https://semver.org/spec/v2.0.0.html).
 - The operator now parses links from the compliance content and renders it in
   custom resources accordingly.
+- `ComplianceCheckResult` objects are automatically cleaned up on each scan.
+  This prevents the [illusion](https://issues.redhat.com/browse/OCPBUGS-3009)
+  that changes to profiles, like excluding rules, aren't taking effect because
+  stale `ComplianceCheckResult` objects are left over from previous runs.
 - The operator have the ability to hide warnings for certain failed to fetched
   resources, this is useful when the user does not want to see the warnings

diff --git a/cmd/manager/aggregator.go b/cmd/manager/aggregator.go
index 18e5f925b..d1eaf9e40 100644
--- a/cmd/manager/aggregator.go
+++ b/cmd/manager/aggregator.go
@@ -38,6 +38,7 @@ import (
 	"k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/version"
@@ -557,6 +558,33 @@ func createResults(crClient aggregatorCrClient, scan *compv1alpha1.ComplianceSca
 	// aggregator need to know it's using gRPC under the hood, probably
 	// not).

+	// Find all the existing scan results. As we iterate through the list
+	// of the most recent results below, we remove entries from the list
+	// of existing results. By the end of the loop, we are left with the
+	// results that are effectively stale, e.g., checks that have been
+	// disabled through a tailored profile. If we don't clean these up
+	// during the aggregation phase, it will look like they're still
+	// valid, even though they're from an old scan.
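+	//
+	// For example, for a hypothetical scan named "example-scan", the
+	// selector built below matches only CCRs labeled with that scan name
+	// (compv1alpha1.ComplianceScanLabel), so results from other scans in
+	// the namespace are never treated as stale here.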
+	staleComplianceCheckResults := make(map[string]compv1alpha1.ComplianceCheckResult)
+	complianceCheckResults := compv1alpha1.ComplianceCheckResultList{}
+	withLabel := map[string]string{
+		compv1alpha1.ComplianceScanLabel: scan.Name,
+	}
+	lo := runtimeclient.ListOptions{
+		Namespace:     scan.Namespace,
+		LabelSelector: labels.SelectorFromSet(withLabel),
+	}
+	err := crClient.getClient().List(context.TODO(), &complianceCheckResults, &lo)
+	if err != nil {
+		return fmt.Errorf("Unable to fetch existing ComplianceCheckResultList: %w", err)
+	}
+	for _, r := range complianceCheckResults.Items {
+		// Use a map so that we can find specific
+		// ComplianceCheckResults without iterating over the list for
+		// every new result from the latest scan.
+		staleComplianceCheckResults[r.Name] = r
+	}
+
 	for _, pr := range consistentResults {
 		if pr == nil || pr.CheckResult == nil {
 			cmdLog.Info("nil result or result.check, this shouldn't happen")
@@ -592,6 +620,14 @@ func createResults(crClient aggregatorCrClient, scan *compv1alpha1.ComplianceSca
 		if err := createOrUpdateOneResult(crClient, scan, checkResultLabels, checkResultAnnotations, checkResultExists, pr.CheckResult); err != nil {
 			return fmt.Errorf("cannot create or update checkResult %s: %v", pr.CheckResult.Name, err)
 		}
+
+		// Remove the ComplianceCheckResult from the set of stale
+		// results so we don't delete it later. Deleting a key that
+		// isn't present is a no-op, so no existence check is needed.
+		delete(staleComplianceCheckResults, foundCheckResult.Name)
+
 		// Handle forwarding.
 		f.SendComplianceCheckResult(pr.CheckResult)
@@ -615,6 +651,18 @@ func createResults(crClient aggregatorCrClient, scan *compv1alpha1.ComplianceSca
 		}
 	}

+	// If there are any ComplianceCheckResults left in
+	// staleComplianceCheckResults, they were from previous scans and we
+	// should delete them. Otherwise, we give users the impression that
+	// changes they've made to their scans, profiles, or settings haven't
+	// taken effect.
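+	//
+	// Note: deleting a stale CCR is expected to clean up the
+	// ComplianceRemediation created for it as well, assuming the
+	// remediation is garbage collected via its owner reference to the
+	// check result; the e2e tests in this patch assert that the
+	// remediation disappears along with the check.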
+	for _, result := range staleComplianceCheckResults {
+		err := crClient.getClient().Delete(context.TODO(), &result)
+		if err != nil {
+			return fmt.Errorf("Unable to delete stale ComplianceCheckResult %s: %w", result.Name, err)
+		}
+	}
+
 	return nil
 }

diff --git a/config/rbac/remediation_aggregator_role.yaml b/config/rbac/remediation_aggregator_role.yaml
index 59e6fa8da..0916fb074 100644
--- a/config/rbac/remediation_aggregator_role.yaml
+++ b/config/rbac/remediation_aggregator_role.yaml
@@ -52,8 +52,10 @@ rules:
   verbs:
   - create
   - get
+  - list
   - update
   - patch
+  - delete
 - apiGroups:
   - compliance.openshift.io
   resources:
diff --git a/tests/e2e/framework/common.go b/tests/e2e/framework/common.go
index e07d2fa69..00eb7e6d1 100644
--- a/tests/e2e/framework/common.go
+++ b/tests/e2e/framework/common.go
@@ -1647,6 +1647,26 @@ func (f *Framework) AssertCheckRemediation(name, namespace string, shouldHaveRem
 	return nil
 }

+func (f *Framework) AssertRemediationExists(name, namespace string) error {
+	var r compv1alpha1.ComplianceRemediation
+	err := f.Client.Get(context.TODO(), types.NamespacedName{Name: name, Namespace: namespace}, &r)
+	if err != nil {
+		return fmt.Errorf("Failed to assert ComplianceRemediation %s exists: %w", name, err)
+	}
+	return nil
+}
+
+func (f *Framework) AssertRemediationDoesNotExists(name, namespace string) error {
+	var r compv1alpha1.ComplianceRemediation
+	err := f.Client.Get(context.TODO(), types.NamespacedName{Name: name, Namespace: namespace}, &r)
+	if apierrors.IsNotFound(err) {
+		return nil
+	} else if err != nil {
+		return fmt.Errorf("Failed to assert ComplianceRemediation %s does not exist: %w", name, err)
+	}
+	return fmt.Errorf("Failed to assert ComplianceRemediation %s does not exist: the remediation was found", name)
+}
+
 func (f *Framework) TaintNode(node *core.Node, taint core.Taint) error {
 	taintedNode := node.DeepCopy()
 	if taintedNode.Spec.Taints == nil {
@@ -2334,3 +2354,21 @@ func (f *Framework) AssertCronJobIsNotSuspended(name string) error {
 	log.Printf("CronJob %s is active", name)
 	return nil
 }
+
+func (f *Framework) AssertScanDoesNotContainCheck(scanName, checkName, namespace string) error {
+	var getCheck compv1alpha1.ComplianceCheckResult
+	err := f.Client.Get(context.TODO(), types.NamespacedName{Name: checkName, Namespace: namespace}, &getCheck)
+	if apierrors.IsNotFound(err) {
+		// The check doesn't exist at all, which means it also doesn't exist in *any* scan
+		log.Printf("didn't find ComplianceCheckResult %s", checkName)
+		return nil
+	} else if err != nil {
+		return err
+	}
+	// Make sure the check we found actually belongs to the scan we're given. This is extra
+	// validation in case the same check is used across other scans.
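+	// For example, a hypothetical check named "example-check" could be
+	// present in the namespace from a different scan; matching on the
+	// scan-name label keeps this assertion scoped to scanName only.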
+	if getCheck.Labels[compv1alpha1.ComplianceScanLabel] == scanName {
+		return fmt.Errorf("found ComplianceCheckResult %s in ComplianceScan %s", checkName, scanName)
+	}
+	return nil
+}
diff --git a/tests/e2e/parallel/main_test.go b/tests/e2e/parallel/main_test.go
index 5151ac808..81794e08f 100644
--- a/tests/e2e/parallel/main_test.go
+++ b/tests/e2e/parallel/main_test.go
@@ -2855,5 +2855,118 @@ func TestRuleHasProfileAnnotation(t *testing.T) {
 			t.Fatalf("expected to find profile %s in rule %s", profileName, rule.Name)
 		}
 	}
 }
+
+func TestScanCleansUpComplianceCheckResults(t *testing.T) {
+	f := framework.Global
+	t.Parallel()
+
+	tpName := framework.GetObjNameFromTest(t)
+	bindingName := tpName + "-binding"
+
+	// create a tailored profile
+	tp := &compv1alpha1.TailoredProfile{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      tpName,
+			Namespace: f.OperatorNamespace,
+		},
+		Spec: compv1alpha1.TailoredProfileSpec{
+			Title:       tpName,
+			Description: tpName,
+			Extends:     "ocp4-cis",
+		},
+	}
+
+	err := f.Client.Create(context.TODO(), tp, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Client.Delete(context.TODO(), tp)
+
+	// run a scan
+	ssb := compv1alpha1.ScanSettingBinding{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      bindingName,
+			Namespace: f.OperatorNamespace,
+		},
+		Profiles: []compv1alpha1.NamedObjectReference{
+			{
+				Name:     tpName,
+				Kind:     "TailoredProfile",
+				APIGroup: "compliance.openshift.io/v1alpha1",
+			},
+		},
+		SettingsRef: &compv1alpha1.NamedObjectReference{
+			Name:     "default",
+			Kind:     "ScanSetting",
+			APIGroup: "compliance.openshift.io/v1alpha1",
+		},
+	}
+	err = f.Client.Create(context.TODO(), &ssb, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Client.Delete(context.TODO(), &ssb)
+
+	if err := f.WaitForSuiteScansStatus(f.OperatorNamespace, bindingName, compv1alpha1.PhaseDone, compv1alpha1.ResultNonCompliant); err != nil {
+		t.Fatal(err)
+	}
+
+	// verify a compliance check result exists
+	checkName := tpName + "-audit-profile-set"
+	checkResult := compv1alpha1.ComplianceCheckResult{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      checkName,
+			Namespace: f.OperatorNamespace,
+		},
+		ID:       "xccdf_org.ssgproject.content_rule_audit_profile_set",
+		Status:   compv1alpha1.CheckResultFail,
+		Severity: compv1alpha1.CheckResultSeverityMedium,
+	}
+	err = f.AssertHasCheck(bindingName, tpName, checkResult)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := f.AssertRemediationExists(checkName, f.OperatorNamespace); err != nil {
+		t.Fatal(err)
+	}
+
+	// update the tailored profile to exclude the rule before we kick off another run
+	tpGet := &compv1alpha1.TailoredProfile{}
+	err = f.Client.Get(context.TODO(), types.NamespacedName{Name: tpName, Namespace: f.OperatorNamespace}, tpGet)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	tpUpdate := tpGet.DeepCopy()
+	ruleName := "ocp4-audit-profile-set"
+	tpUpdate.Spec.DisableRules = []compv1alpha1.RuleReferenceSpec{
+		{
+			Name:      ruleName,
+			Rationale: "testing to ensure scan results are cleaned up",
+		},
+	}
+
+	err = f.Client.Update(context.TODO(), tpUpdate)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// rerun the scan
+	err = f.ReRunScan(tpName, f.OperatorNamespace)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := f.WaitForSuiteScansStatus(f.OperatorNamespace, bindingName, compv1alpha1.PhaseDone, compv1alpha1.ResultNonCompliant); err != nil {
+		t.Fatal(err)
+	}
+
+	// verify the compliance check result doesn't exist, which also means
+	// the compliance remediation should be gone
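+	// Both assertions below rely on the aggregator pruning the stale
+	// "<tpName>-audit-profile-set" CCR during the rescan; checkName was
+	// built above by prefixing the check with the scan (tailored
+	// profile) name.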
+	if err = f.AssertScanDoesNotContainCheck(tpName, checkName, f.OperatorNamespace); err != nil {
+		t.Fatal(err)
+	}
+	if err = f.AssertRemediationDoesNotExists(checkName, f.OperatorNamespace); err != nil {
+		t.Fatal(err)
+	}
+}
diff --git a/tests/e2e/serial/main_test.go b/tests/e2e/serial/main_test.go
index 86a47482d..5f013cbe8 100644
--- a/tests/e2e/serial/main_test.go
+++ b/tests/e2e/serial/main_test.go
@@ -401,6 +401,15 @@ func TestAutoRemediate(t *testing.T) {
 		t.Fatal(err)
 	}

+	// Fetch the remediation here so we can clean up the machine config
+	// later. We do this before the rescan takes place because the rescan
+	// will prune the remediation after the check passes.
+	rem := &compv1alpha1.ComplianceRemediation{}
+	err = f.Client.Get(context.TODO(), types.NamespacedName{Name: remName, Namespace: f.OperatorNamespace}, rem)
+	if err != nil {
+		t.Fatal(err)
+	}
+
 	// We can re-run the scan at this moment and check that it's now compliant
 	// and it's reflected in a CheckResult
 	err = f.ReRunScan(scanName, f.OperatorNamespace)
@@ -436,15 +445,9 @@ func TestAutoRemediate(t *testing.T) {
 		t.Fatal(err)
 	}

-	// The test should not leave junk around, let's remove the MC and wait for the nodes to stabilize
-	// again
-	log.Printf("Removing applied remediation\n")
-	// Fetch remediation here so it can be deleted
-	rem := &compv1alpha1.ComplianceRemediation{}
-	err = f.Client.Get(context.TODO(), types.NamespacedName{Name: remName, Namespace: f.OperatorNamespace}, rem)
-	if err != nil {
-		t.Fatal(err)
-	}
+	// The test should not leave junk around, let's remove the MC and wait
+	// for the nodes to stabilize again
+	log.Printf("Removing applied machine config\n")
 	mcfgToBeDeleted := rem.Spec.Current.Object.DeepCopy()
 	mcfgToBeDeleted.SetName(rem.GetMcName())
 	err = f.Client.Delete(context.TODO(), mcfgToBeDeleted)