.github/workflows/e2e-test.yaml

name: e2e-test
on:
  pull_request:
  push:
    branches:
      - main
  workflow_dispatch:
    inputs:
      kernel-image:
        type: string
        description: 'The kernel image to use for the VMs. If not specified, a kernel will be built from source'
        required: false
      cluster:
        type: choice
        description: 'The cluster to run the tests on'
        options:
          - k3d
          - kind
        default: k3d
  workflow_call:
    inputs:
      tag:
        type: string
        description: 'Tag to use for images, skipping building'
        required: false
      push-yamls:
        type: boolean
        description: 'If true, pushes a tarball containing the rendered yaml manifests as an artifact'
        required: false

env:
  IMG_E2E_TEST: vm-postgres:15-bullseye

defaults:
  run:
    shell: bash -euo pipefail {0}

jobs:
  get-tag:
    outputs:
      tag: ${{ inputs.tag || steps.get-tag.outputs.tag }}
    runs-on: ubuntu-latest
    steps:
      - name: get tag
        if: ${{ inputs.tag == '' }}
        id: get-tag
        env:
          SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        run: |
          test -n "$SHA"
          sha="${SHA::7}"
          echo "tag=$sha.$GITHUB_RUN_ID" | tee -a $GITHUB_OUTPUT

  build-images:
    needs: get-tag
    uses: ./.github/workflows/build-images.yaml
    with:
      skip: ${{ inputs.tag != '' }}
      tag: ${{ inputs.tag || needs.get-tag.outputs.tag }}
      kernel-image: ${{ inputs.kernel-image }}
      # note: setting to preserve runner pods will mean that if !skip, they'll be built with those
      # settings and used properly in the tests. But if skip (because inputs.tag != ''), then this
      # setting will have no effect and the release images will be normal.
      controller-preserve-runner-pods: true
    secrets: inherit

  build-test-vm:
    needs: get-tag
    uses: ./.github/workflows/build-test-vm.yaml
    with:
      skip: ${{ inputs.tag != '' }}
      tag: ${{ inputs.tag || needs.get-tag.outputs.tag }}
    secrets: inherit

  e2e-tests:
    needs: [ build-images, build-test-vm ]
    strategy:
      fail-fast: false
      matrix:
        cluster:
          - ${{ inputs.cluster || 'k3d' }}
    runs-on: [ self-hosted, gen3, large ]
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0 # fetch all, so that we also include tags

      - uses: actions/setup-go@v5
        with:
          go-version-file: 'go.mod'
          # Disable cache on self-hosted runners to avoid /usr/bin/tar errors, see https://github.com/actions/setup-go/issues/403
          cache: false
        # Sometimes setup-go gets stuck. Without this, it'll keep going until the job gets killed
        timeout-minutes: 10

      - name: Install dependencies
        run: |
          sudo apt install -y python3-venv
          make e2e-tools
          echo $(pwd)/bin >> $GITHUB_PATH

      - name: Check dependencies
        run: |
          kubectl version --client --output=yaml
          k3d version
          kind version
          kuttl version
          docker version

      - run: make render-release
        env:
          IMG_CONTROLLER:       ${{ needs.build-images.outputs.controller }}
          IMG_VXLAN_CONTROLLER: ${{ needs.build-images.outputs.vxlan-controller }}
          IMG_RUNNER:           ${{ needs.build-images.outputs.runner }}
          IMG_SCHEDULER:        ${{ needs.build-images.outputs.scheduler }}
          IMG_AUTOSCALER_AGENT: ${{ needs.build-images.outputs.autoscaler-agent }}

      - name: upload manifests
        # nb: use format(..) to catch both inputs.push-yamls = true AND inputs.push-yamls = 'true'.
        if: ${{ format('{0}', inputs.push-yamls) == 'true' }}
        uses: actions/upload-artifact@v4
        with:
          name: rendered_manifests
          # nb: prefix before wildcard is removed from the uploaded files, so the artifact should
          # contain e.g.
          #   - autoscale-scheduler.yaml
          #   - autoscaler-agent.yaml
          #   ...
          # ref https://github.com/actions/upload-artifact#upload-using-multiple-paths-and-exclusions
          path: rendered_manifests/*
          if-no-files-found: error
          retention-days: 2 # minimum is 1 day; 0 is default. These are only used temporarily.

      - name: set custom docker config directory
        uses: ./.github/actions/set-docker-config-dir

      - uses: docker/login-action@v3
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

      # https://docs.k3s.io/installation/private-registry#registries-configuration-file
      # https://github.com/neondatabase/autoscaling/issues/975
      - name: set k3d registries.yaml
        # TODO: Implement an equivalent for kind?
        # Relevant docs seem to be here: https://kind.sigs.k8s.io/docs/user/private-registries
        if: ${{ matrix.cluster == 'k3d' }}
        env:
          DOCKERHUB_USERNAME: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          DOCKERHUB_PASSWORD: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
        run: |
          {
            echo "configs:"
            echo "  registry-1.docker.io:"
            echo "    auth:"
            echo "      username: $DOCKERHUB_USERNAME"
            echo "      password: $DOCKERHUB_PASSWORD"
          } >> $(pwd)/k3d/registries.yaml


      - run: make ${{ matrix.cluster }}-setup
        env:
          USE_REGISTRIES_FILE: true

      - name: deploy components
        timeout-minutes: 3
        run: |
          rendered () { echo "rendered_manifests/$1"; }

          kubectl apply -f $(rendered multus.yaml)
          kubectl -n kube-system rollout status daemonset kube-multus-ds
          kubectl apply -f $(rendered whereabouts.yaml)
          kubectl -n kube-system rollout status daemonset whereabouts
          kubectl apply -f $(rendered neonvm-runner-image-loader.yaml)
          kubectl -n neonvm-system rollout status daemonset neonvm-runner-image-loader
          kubectl apply -f $(rendered neonvm.yaml)
          kubectl -n neonvm-system rollout status daemonset neonvm-device-plugin
          kubectl apply -f $(rendered neonvm-controller.yaml)
          kubectl -n neonvm-system rollout status deployment neonvm-controller
          kubectl apply -f $(rendered neonvm-vxlan-controller.yaml)
          kubectl -n neonvm-system rollout status daemonset neonvm-vxlan-controller
          kubectl apply -f $(rendered autoscale-scheduler.yaml)
          kubectl -n kube-system rollout status deployment autoscale-scheduler
          kubectl apply -f $(rendered autoscaler-agent.yaml)
          kubectl -n kube-system rollout status daemonset autoscaler-agent

      - name: load e2e test vm image
        env:
          TEST_IMAGE: ${{ needs.build-test-vm.outputs.vm-postgres-16-bullseye }}
        timeout-minutes: 2
        run: |
          # Pull the docker image so we can re-tag it, because using a consistent tag inside the
          # cluster means we can avoid dynamically editing the image used in the kuttl files.
          docker pull "$TEST_IMAGE"
          docker image tag "$TEST_IMAGE" "$IMG_E2E_TEST"
          make load-example-vms

      - run: make e2e
        timeout-minutes: 15

      - name: Get k8s logs and events
        if: always()
        run: |
          if ! kubectl config current-context; then
            echo "skipping cluster logs because no cluster found in kubectl context"
            exit 0
          fi

          namespaces=$(kubectl get namespaces -o jsonpath='{.items[*].metadata.name}')
          for namespace in $namespaces; do
            if [[ "$namespace" == "neonvm-system" ]] || [[ "$namespace" == kuttl-test-* ]]; then
              tee_if_needed=$GITHUB_STEP_SUMMARY
            else
              tee_if_needed=/dev/null
            fi

            {
              echo "<details>"
              echo "<summary>Namespace=$namespace</summary>"
            } | tee -a $tee_if_needed

            pods=$(kubectl get pods -n $namespace -o jsonpath='{.items[*].metadata.name}')
            for pod in $pods; do
              {
                echo "<details>"
                echo "<summary>- Namespace=$namespace Pod=$pod Logs</summary>"
                echo "<pre>"
              } | tee -a $tee_if_needed

              restarts=$(
                kubectl get pod -n $namespace $pod -o jsonpath='{.status.containerStatuses[0].restartCount}' || echo '0'
              )
              {
                if [ "$restarts" -ne 0 ]; then
                  echo "CONTAINER RESTARTED $restarts TIME(S)"
                  echo "Previous logs:"
                  kubectl logs -n $namespace -p $pod || echo 'Error getting logs'
                  echo "Current logs:"
                  kubectl logs -n $namespace $pod || echo 'Error getting logs'
                else
                  echo "Logs:"
                  kubectl logs -n $namespace $pod || echo 'Error getting logs'
                fi
              } | tee -a $tee_if_needed
              {
                echo "</pre>"
                echo "</details>"
              } | tee -a $tee_if_needed

              {
                echo "<details>"
                echo "<summary>- Namespace=$namespace Pod=$pod Events</summary>"
                echo "<pre>"
              } | tee -a $tee_if_needed

              (kubectl get events --namespace $namespace --field-selector involvedObject.name=$pod || echo 'Error getting events') | tee -a $tee_if_needed

              {
                echo "</pre>"
                echo "</pre>"
                echo "</details>"
              } | tee -a $tee_if_needed
            done

            echo "</details>" | tee -a $tee_if_needed
          done

      - name: Cleanup
        if: always()
        run: make ${{ matrix.cluster }}-destroy