Merge pull request #123 from dasmeta/DMVP-5664-flagger-canary-deploym…

…ents-ability DMVP-5664 flagger canary deployments ability
dasmeta · Nov 6, 2024 · f6ec885 · f6ec885
2 parents 8da817f + d1728c2
commit f6ec885
Show file tree

Hide file tree

Showing 11 changed files with 346 additions and 4 deletions.
diff --git a/charts/base/Chart.yaml b/charts/base/Chart.yaml
@@ -15,10 +15,10 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.2.6
+version: 0.2.7
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: "0.2.6"
+appVersion: "0.2.7"
diff --git a/charts/base/README.md b/charts/base/README.md
@@ -602,3 +602,71 @@ terminationGracePeriodSeconds: 65
 ### Deployment use chart-hooks
 annotations:
   "helm.sh/hook": pre-install,pre-upgrade
+
+
+### custom rollout strategy (canary, blue/green) configs by using flagger
+```yaml
+# This config allows to enable custom rollout strategies by using different providers/operators
+# right now only flagger (https://flagger.app/) operator supported and tested for canary with nginx
+## NOTE for flagger operator:
+## - flagger supports several service meshes and ingresses as providers for traffic splitting; by default we use nginx here, so check the docs and make sure at least one supported provider is used for your app
+## - you need to have flagger tool/operator already installed to be able to use its crd, this can be done by installing flagger helm https://artifacthub.io/packages/helm/flagger/flagger
+## - you also need at least one metrics server/provider that flagger supports (such as prometheus) enabled, as flagger uses metrics to check success rates; the flagger helm chart allows installing prometheus
+## - with flagger enabled we disable native kubernetes service as flagger creates/overrides this service
+## - with separate installed prometheus operator(not one that comes with flagger helm) the default `request-success-rate` and `request-duration` metrics templates may not work so you may need to create custom metric templates, the canary+nginx+prometheus metric template can be created by using `dasmeta/flagger-metric-template` chart
+rolloutStrategy:
+  enabled: true
+  operator: flagger
+  configs: # here are all supported flagger configs
+    provider: nginx # the flagger ingress/service-mesh provider (default nginx)
+    progressDeadlineSeconds: 61 # the maximum time in seconds for the canary deployment to make progress before it is rolled back (default 600s)
+    canaryReadyThreshold: 51 # minimum percentage of canary pods that must be ready before considering canary ready for traffic shifting (default 100)
+    primaryReadyThreshold: 51 # minimum percentage of primary pods that must be ready before considering primary ready for traffic shifting (default 100)
+    interval: 11s # schedule interval (default 60s)
+    threshold: 11 # max number of failed metric checks before rollback (default 10)
+    maxWeight: 31 # max traffic percentage (0-100) routed to canary (default 30)
+    stepWeight: 11 # canary increment step percentage (0-100) (default 10)
+    # min and max replicas count for primary hpa, default to main app hpa, the main app hpa values also being used for canary deploy hpa so we use this options to have custom values for primary hpa
+    primaryScalerMinReplicas: 3
+    primaryScalerMaxReplicas: 7
+    metrics: # metrics template configs to use for identifying if canary deploy handles request normally, the `request-success-rate` and `request-duration` named ones are available by default, and you can create custom metric templates
+      - name: request-success-rate
+        # minimum req success rate (non 5xx responses) percentage (0-100)
+        thresholdRange:
+          min: 99
+        interval: 1m
+      - name: request-duration
+        # maximum req duration P99, milliseconds
+        thresholdRange:
+          max: 500
+        interval: 1m
+      # - name: request-success-rate-custom
+      #   interval: 1m
+      #   templateRef:
+      #     name: request-success-rate-custom
+      #     namespace: ingress-nginx
+      #   # minimum req success rate (non 5xx responses) percentage (0-100)
+      #   thresholdRange:
+      #     min: 99
+      # - name: request-duration-custom
+      #   interval: 1m
+      #   templateRef:
+      #     name: request-duration-custom
+      #     namespace: ingress-nginx
+      #   # maximum req duration P99, milliseconds
+      #   thresholdRange:
+      #     max: 500
+    webhooks: # (optional) webhooks can be used for load testing before traffic switching to canaries by using `pre-rollout` type and also generating traffic
+      - name: acceptance-test
+        type: pre-rollout
+        url: http://flagger-loadtester.localhost/
+        timeout: 30s
+        metadata:
+          type: bash
+          cmd: "curl -sd 'test' http://http-echo-canary/ping | grep ping"
+      - name: load-test
+        url: http://flagger-loadtester.localhost/
+        timeout: 5s
+        metadata:
+          cmd: "hey -z 1m -q 3 -c 1 http://http-echo.localhost/ping"
+```
diff --git a/charts/base/templates/rollout-strategy.yaml b/charts/base/templates/rollout-strategy.yaml
@@ -0,0 +1,60 @@
+{{- /*
+Renders a Flagger Canary resource for this chart's Deployment when
+`rolloutStrategy.enabled` is true and `rolloutStrategy.operator` is "flagger".
+All keys under `rolloutStrategy.configs` are optional; the defaults applied
+below are noted next to each field.
+NOTE: the `default` function treats 0 and empty values as unset, so an
+explicit 0 for any numeric option falls back to its default.
+*/ -}}
+{{- if .Values.rolloutStrategy.enabled -}}
+{{- if eq .Values.rolloutStrategy.operator "flagger" -}}
+apiVersion: flagger.app/v1beta1
+kind: Canary
+metadata:
+  name: {{ include "base.fullname" . }}
+spec:
+  provider: {{ .Values.rolloutStrategy.configs.provider | default "nginx" }}
+  # deployment reference
+  targetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "base.fullname" . }}
+  # ingress reference (TODO: flagger supports multiple ingresses; check and implement multiple ingress support here as well)
+  ingressRef:
+    apiVersion: networking.k8s.io/v1
+    kind: Ingress
+    name: {{ include "base.fullname" . }}
+  {{- if .Values.autoscaling.enabled }}
+  # HPA reference (optional)
+  autoscalerRef:
+    apiVersion: autoscaling/v2
+    kind: HorizontalPodAutoscaler
+    name: {{ include "base.fullname" . }}
+    primaryScalerReplicas:
+      # min and max replica counts for the primary hpa; they default to the main app hpa values, which are also used for the canary deploy hpa, so these options allow custom values for the primary hpa
+      minReplicas: {{ .Values.rolloutStrategy.configs.primaryScalerMinReplicas | default .Values.autoscaling.minReplicas }}
+      maxReplicas: {{ .Values.rolloutStrategy.configs.primaryScalerMaxReplicas | default .Values.autoscaling.maxReplicas }}
+  {{- end }}
+
+  # the maximum time in seconds for the canary deployment to make progress before it is rolled back (default 600s)
+  progressDeadlineSeconds: {{ .Values.rolloutStrategy.configs.progressDeadlineSeconds | default 600 }}
+  service:
+    # ClusterIP port number
+    port: {{ .Values.service.port }}
+    # container port number or name
+    targetPort: {{ .Values.service.targetPort | default .Values.containerPort }}
+  analysis:
+    # minimum percentage of canary pods that must be ready before considering canary ready for traffic shifting (default 100)
+    canaryReadyThreshold: {{ .Values.rolloutStrategy.configs.canaryReadyThreshold | default 100 }}
+    # minimum percentage of primary pods that must be ready before considering primary ready for traffic shifting (default 100)
+    primaryReadyThreshold: {{ .Values.rolloutStrategy.configs.primaryReadyThreshold | default 100 }}
+    # schedule interval (default 60s)
+    interval: {{ .Values.rolloutStrategy.configs.interval | default "60s" }}
+    # max number of failed metric checks before rollback (default 10)
+    threshold: {{ .Values.rolloutStrategy.configs.threshold | default 10 }}
+    # max traffic percentage (0-100) routed to canary (default 30)
+    maxWeight: {{ .Values.rolloutStrategy.configs.maxWeight | default 30 }}
+    # canary increment step percentage (0-100) (default 10)
+    stepWeight: {{ .Values.rolloutStrategy.configs.stepWeight | default 10 }}
+    {{- /* only emit `metrics` when configured; toYaml of an unset value would render `metrics: null` */}}
+    {{- with .Values.rolloutStrategy.configs.metrics }}
+    # metrics checks
+    metrics:
+      {{- toYaml . | nindent 6 }}
+    {{- end }}
+    {{- with .Values.rolloutStrategy.configs.webhooks }}
+    # testing (optional)
+    webhooks:
+      {{- toYaml . | nindent 6 }}
+    {{- end }}
+{{- end }}
+{{- end }}
diff --git a/charts/base/templates/service.yaml b/charts/base/templates/service.yaml
@@ -1,4 +1,4 @@
-{{- if .Values.service.enabled -}}
+{{- if and .Values.service.enabled (not (and .Values.rolloutStrategy.enabled (eq .Values.rolloutStrategy.operator "flagger") ) ) -}}
 apiVersion: v1
 kind: Service
 metadata:

diff --git a/charts/base/values.yaml b/charts/base/values.yaml
@@ -49,7 +49,6 @@ job:
   serviceAccount:
     create: false
     annotations: {}
-
 
 podAnnotations: {}
 
@@ -237,3 +236,70 @@ serviceMonitor:
   interval: 30s
   targetPort: 80
   path: /metrics
+
+# This config allows to enable custom rollout strategies by using different providers/operators
+# right now only flagger (https://flagger.app/) operator supported and tested for canary with nginx
+## NOTE for flagger operator:
+## - flagger supports several service meshes and ingresses as providers for traffic splitting; by default we use nginx here, so check the docs and make sure at least one supported provider is used for your app
+## - you need to have flagger tool/operator already installed to be able to use its crd, this can be done by installing flagger helm https://artifacthub.io/packages/helm/flagger/flagger
+## - you also need at least one metrics server/provider that flagger supports (such as prometheus) enabled, as flagger uses metrics to check success rates; the flagger helm chart allows installing prometheus
+## - with flagger enabled we disable native kubernetes service as flagger creates/overrides this service
+## - with separate installed prometheus operator(not one that comes with flagger helm) the default `request-success-rate` and `request-duration` metrics templates may not work so you may need to create custom metric templates, the canary+nginx+prometheus metric template can be created by using `dasmeta/flagger-metric-template` chart
+rolloutStrategy:
+  enabled: false
+  operator: flagger
+  configs: {}
+  # here are all supported flagger configs
+  # configs:
+  #   provider: nginx # the flagger ingress/service-mesh provider (default nginx)
+  #   progressDeadlineSeconds: 61 # the maximum time in seconds for the canary deployment to make progress before it is rolled back (default 600s)
+  #   canaryReadyThreshold: 51 # minimum percentage of canary pods that must be ready before considering canary ready for traffic shifting (default 100)
+  #   primaryReadyThreshold: 51 # minimum percentage of primary pods that must be ready before considering primary ready for traffic shifting (default 100)
+  #   interval: 11s # schedule interval (default 60s)
+  #   threshold: 11 # max number of failed metric checks before rollback (default 10)
+  #   maxWeight: 31 # max traffic percentage (0-100) routed to canary (default 30)
+  #   stepWeight: 11 # canary increment step percentage (0-100) (default 10)
+  #   # min and max replicas count for primary hpa, default to main app hpa, the main app hpa values also being used for canary deploy hpa so we use this options to have custom values for primary hpa
+  #   primaryScalerMinReplicas: 3
+  #   primaryScalerMaxReplicas: 7
+  #   metrics: # metrics template configs to use for identifying if canary deploy handles request normally, the `request-success-rate` and `request-duration` named ones are available by default, and you can create custom metric templates
+  #     - name: request-success-rate
+  #       # minimum req success rate (non 5xx responses) percentage (0-100)
+  #       thresholdRange:
+  #         min: 99
+  #       interval: 1m
+  #     - name: request-duration
+  #       # maximum req duration P99, milliseconds
+  #       thresholdRange:
+  #         max: 500
+  #       interval: 1m
+  #     # - name: request-success-rate-custom
+  #     #   interval: 1m
+  #     #   templateRef:
+  #     #     name: request-success-rate-custom
+  #     #     namespace: ingress-nginx
+  #     #   # minimum req success rate (non 5xx responses) percentage (0-100)
+  #     #   thresholdRange:
+  #     #     min: 99
+  #     # - name: request-duration-custom
+  #     #   interval: 1m
+  #     #   templateRef:
+  #     #     name: request-duration-custom
+  #     #     namespace: ingress-nginx
+  #     #   # maximum req duration P99, milliseconds
+  #     #   thresholdRange:
+  #     #     max: 500
+  #
+  #   webhooks: # (optional) webhooks can be used for load testing before traffic switching to canaries by using `pre-rollout` type and also generating traffic
+  #     - name: acceptance-test
+  #       type: pre-rollout
+  #       url: http://flagger-loadtester.localhost/
+  #       timeout: 30s
+  #       metadata:
+  #         type: bash
+  #         cmd: "curl -sd 'test' http://http-echo-canary/ping | grep ping"
+  #     - name: load-test
+  #       url: http://flagger-loadtester.localhost/
+  #       timeout: 5s
+  #       metadata:
+  #         cmd: "hey -z 1m -q 3 -c 1 http://http-echo.localhost/ping"
diff --git a/charts/flagger-metric-template/.helmignore b/charts/flagger-metric-template/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/charts/flagger-metric-template/Chart.yaml b/charts/flagger-metric-template/Chart.yaml
@@ -0,0 +1,24 @@
+apiVersion: v2
+name: flagger-metric-template
+description: A Helm chart for Kubernetes to create Flagger metric templates
+
+# A chart can be either an 'application' or a 'library' chart.
+#
+# Application charts are a collection of templates that can be packaged into versioned archives
+# to be deployed.
+#
+# Library charts provide useful utilities or functions for the chart developer. They're included as
+# a dependency of application charts to inject those utilities and functions into the rendering
+# pipeline. Library charts do not define any templates and therefore cannot be deployed.
+type: application
+
+# This is the chart version. This version number should be incremented each time you make changes
+# to the chart and its templates, including the app version.
+# Versions are expected to follow Semantic Versioning (https://semver.org/)
+version: 0.1.0
+
+# This is the version number of the application being deployed. This version number should be
+# incremented each time you make changes to the application. Versions are not expected to
+# follow Semantic Versioning. They should reflect the version the application is using.
+# It is recommended to use it with quotes.
+appVersion: "0.1.0"
diff --git a/charts/flagger-metric-template/README.md b/charts/flagger-metric-template/README.md
@@ -0,0 +1,11 @@
+# This helm chart allows creating flagger custom metric templates to use in canary rollouts
+
+## The `createNginxCustomMetricTemplates` option (true by default) creates nginx custom metric templates named `request-success-rate-custom` and `request-duration-custom`
+
+## example of custom metric templates
+```yaml
+metricTemplates:
+  - name: my-custom-request-rate-metric-template
+    query: |
+      sum(rate(nginx_ingress_controller_requests{exported_namespace="{{ namespace }}",ingress="{{ ingress }}",status!~"5.*"}[1m]))/sum(rate(nginx_ingress_controller_requests{exported_namespace="{{ namespace }}",ingress="{{ ingress }}"}[1m]))*100
+```
diff --git a/charts/flagger-metric-template/templates/flagger-canary-nginx-custom.metric-template.yaml b/charts/flagger-metric-template/templates/flagger-canary-nginx-custom.metric-template.yaml
@@ -0,0 +1,60 @@
+{{- /*
+Ready-made Flagger MetricTemplates for ingress-nginx metrics, rendered only
+when `createNginxCustomMetricTemplates` is true. Both use the metrics provider
+configured under `.Values.provider`.
+The namespace, ingress and interval placeholders are emitted as literal
+double-brace expressions so that Helm leaves them in the rendered query
+(presumably for Flagger to substitute at analysis time).
+*/ -}}
+{{- if .Values.createNginxCustomMetricTemplates -}}
+# Percentage (0-100) of canary requests whose status does not match 5.*
+apiVersion: flagger.app/v1beta1
+kind: MetricTemplate
+metadata:
+  name: request-success-rate-custom
+spec:
+  provider:
+    {{- toYaml .Values.provider | nindent 4 }}
+  query: |
+    sum(
+      rate(
+        nginx_ingress_controller_requests{
+          exported_namespace="{{ "{{" }} namespace {{ "}}" }}",
+          ingress="{{ "{{" }} ingress {{ "}}" }}",
+          canary!="",
+          status!~"5.*"
+        }[{{ "{{" }} interval {{ "}}" }}]
+      )
+    )
+    /
+    sum(
+      rate(
+        nginx_ingress_controller_requests{
+          exported_namespace="{{ "{{" }} namespace {{ "}}" }}",
+          ingress="{{ "{{" }} ingress {{ "}}" }}",
+          canary!=""
+        }[{{ "{{" }} interval {{ "}}" }}]
+      )
+    )
+    * 100
+---
+# Average canary response duration: rate of the duration sum divided by the
+# rate of the request count, multiplied by 1000 (the metric name indicates
+# seconds, so the result is presumably milliseconds)
+apiVersion: flagger.app/v1beta1
+kind: MetricTemplate
+metadata:
+  name: request-duration-custom
+spec:
+  provider:
+    {{- toYaml .Values.provider | nindent 4 }}
+  query: |
+    sum(
+      rate(
+        nginx_ingress_controller_response_duration_seconds_sum{
+          exported_namespace="{{ "{{" }} namespace {{ "}}" }}",
+          ingress="{{ "{{" }} ingress {{ "}}" }}",
+          canary!=""
+        }[{{ "{{" }} interval {{ "}}" }}]
+      )
+    )
+    /
+    sum(
+      rate(
+        nginx_ingress_controller_response_duration_seconds_count{
+          exported_namespace="{{ "{{" }} namespace {{ "}}" }}",
+          ingress="{{ "{{" }} ingress {{ "}}" }}",
+          canary!=""
+        }[{{ "{{" }} interval {{ "}}" }}]
+      )
+    )
+    * 1000
+{{- end }}
diff --git a/charts/flagger-metric-template/templates/metric-templates.yaml b/charts/flagger-metric-template/templates/metric-templates.yaml
@@ -0,0 +1,11 @@
+{{- /*
+Renders one Flagger MetricTemplate per entry of `.Values.metricTemplates`.
+Each entry supplies `name` and `query`; all entries share the metrics
+provider configured under `.Values.provider`.
+*/ -}}
+{{- range $key, $metricTemplate := .Values.metricTemplates }}
+---
+apiVersion: flagger.app/v1beta1
+kind: MetricTemplate
+metadata:
+  name: {{ $metricTemplate.name | quote }}
+spec:
+  provider:
+    {{- toYaml $.Values.provider | nindent 4 }}
+  # quoted so that multi-line queries (e.g. written with `|` in values) and
+  # queries containing YAML-significant characters render as one valid string
+  query: {{ $metricTemplate.query | quote }}
+{{- end }}
diff --git a/charts/flagger-metric-template/values.yaml b/charts/flagger-metric-template/values.yaml
@@ -0,0 +1,19 @@
+# Default values for flagger-metric-template.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+# metrics provider config for metric-templates, for more info check doc https://docs.flagger.app/usage/metrics#custom-metrics
+provider:
+  type: prometheus
+  address: http://prometheus-service.monitoring:9090
+
+# Whether to create the `request-success-rate-custom` and `request-duration-custom` nginx metric templates; this is useful when a separately installed prometheus operator is used instead of the one included in the flagger helm chart
+createNginxCustomMetricTemplates: true
+
+# List of custom metric template configs
+metricTemplates: []
+## example for metric templates
+# metricTemplates:
+#   - name: my-custom-request-rate-metric-template
+#     query: |
+#       sum(rate(nginx_ingress_controller_requests{exported_namespace="{{ namespace }}",ingress="{{ ingress }}",status!~"5.*"}[1m]))/sum(rate(nginx_ingress_controller_requests{exported_namespace="{{ namespace }}",ingress="{{ ingress }}"}[1m]))*100