From 66ecf82670254cafae0e84f9fc30d6fa454e5e28 Mon Sep 17 00:00:00 2001
From: Daniel Lipovetsky
Date: Wed, 11 Oct 2023 13:51:59 -0700
Subject: [PATCH 1/2] feat: Update cloud-init customization

Changes relative to upstream:

* Add explanatory comments
* Do not use stderr output of preKubeadmCommands to indicate an error with
  bootstrapping

Changes relative to our fork:

* Do not enable IPv6
* Do not remove cloud-init logs and seed
* Do not disable VMware customization
* Do not disable network configuration
* Do not truncate cloud-init-output.log
* Do not report status of HTTP proxy configuration
* Do not configure cloud-init to remove SSH keys on first boot
* Remove commands that are already executed as a result of being defined in
  `preKubeadmCommands`
---
 controllers/cluster_scripts/cloud_init.tmpl | 49 +++++++++++----------
 1 file changed, 26 insertions(+), 23 deletions(-)

diff --git a/controllers/cluster_scripts/cloud_init.tmpl b/controllers/cluster_scripts/cloud_init.tmpl
index ea5f93e19..ff1e9d9ab 100644
--- a/controllers/cluster_scripts/cloud_init.tmpl
+++ b/controllers/cluster_scripts/cloud_init.tmpl
@@ -3,6 +3,14 @@ users:
 - name: root
   lock_passwd: false
 write_files:
+# Due to a known issue with VMware Guest Customization, cloud-init believes every boot
+# is the first boot. This ensures that cloud-init does not remove SSH keys on a reboot.
+- path: /etc/cloud/cloud.cfg.d/cse.cfg
+  owner: root
+  content: |
+    ssh_deletekeys: false
+# The control_plane.sh script runs on the first control plane machine. The node.sh script
+# runs on every subsequent control plane machine, and every worker machine.
 - path: /root/ {{- if .ControlPlane -}} control_plane {{- else -}} node {{- end -}} .sh
   owner: root
   content: |
@@ -11,12 +19,14 @@ write_files:
         vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_status $?"
         ERROR_MESSAGE="$(date) $(caller): $BASH_COMMAND"
         echo "$ERROR_MESSAGE" &>> /var/log/capvcd/customization/error.log
-        if [[ -s /root/kubeadm.err ]]
+        vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_failure_reason $ERROR_MESSAGE"
+
+        CLOUD_INIT_OUTPUT=""
+        if [[ -f /var/log/cloud-init-output.log ]]
         then
-          KUBEADM_FAILURE=$(cat /root/kubeadm.err)
-          ERROR_MESSAGE="$ERROR_MESSAGE $KUBEADM_FAILURE"
+          CLOUD_INIT_OUTPUT=$(/etc/hosts
-      echo "127.0.0.1 localhost" >>/etc/hosts
-      echo "{{ .MachineName }}" >/etc/hostname
-      echo "127.0.0.1" `hostname` >>/etc/hosts
+      echo 'net.ipv6.conf.all.disable_ipv6 = 1' >> /etc/sysctl.conf
+      echo 'net.ipv6.conf.default.disable_ipv6 = 1' >> /etc/sysctl.conf
+      echo 'net.ipv6.conf.lo.disable_ipv6 = 1' >> /etc/sysctl.conf
+      sudo sysctl -p
+      # also remove ipv6 localhost entry from /etc/hosts
+      sed -i 's/::1/127.0.0.1/g' /etc/hosts || true
       vmtoolsd --cmd "info-set guestinfo.postcustomization.networkconfiguration.status successful"
-      vmtoolsd --cmd "info-set guestinfo.metering.status in_progress"
+      # Our images do not ship the VCD metering service, but CAPVCD requires a successful status to finish bootstrapping.
       vmtoolsd --cmd "info-set guestinfo.metering.status successful"
-      vmtoolsd --cmd "info-set guestinfo.postcustomization.proxy.setting.status in_progress"
-      vmtoolsd --cmd "info-set guestinfo.postcustomization.proxy.setting.status successful"
-
       vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} in_progress"
-      {{ .BootstrapRunCmd }}
+      {
+{{ .BootstrapRunCmd }}
+      }
       if [[ ! -f /run/cluster-api/bootstrap-success.complete ]]
       then
         echo "file /run/cluster-api/bootstrap-success.complete not found" &>> /var/log/capvcd/customization/error.log
@@ -58,12 +68,8 @@ write_files:
       echo "$(date) post customization script execution completed" &>> /var/log/capvcd/customization/status.log
       exit 0
 runcmd:
-- 'sudo cloud-init clean --seed --logs'
-- 'sudo cat /dev/null > /var/log/cloud-init-output.log'
+- 'cloud-init clean'
 {{ if .ControlPlane }}
-- '[ ! -f /run/kubeadm/konvoy-set-kube-proxy-configuration.sh] && sudo reboot'
-- '[ ! -f /run/konvoy/containerd-apply-patches.sh] && sudo reboot'
-- '[ ! -f /run/konvoy/restart-containerd-and-wait.sh] && sudo reboot'
- '[ ! -f /root/control_plane.sh ] && sudo reboot'
- '[ ! -f /run/kubeadm/kubeadm.yaml ] && sudo reboot'
- bash /root/control_plane.sh
@@ -74,9 +80,6 @@ runcmd:
 {{ end }}
 timezone: UTC
 disable_root: false
-disable_vmware_customization: true
-network:
-  config: disabled
 preserve_hostname: false
 hostname: "{{ .MachineName }}"
-final_message: "The system is ready after $UPTIME seconds"
\ No newline at end of file
+final_message: "The system is ready after $UPTIME seconds"

From d8316d165661a26a725fb2a6302f7d60fe237af9 Mon Sep 17 00:00:00 2001
From: Daniel Lipovetsky
Date: Thu, 12 Oct 2023 19:45:19 -0700
Subject: [PATCH 2/2] Simplify cloud-init

* Use a shell script to clean the cloud-init cache and reboot.
* Fix error handling of the bootstrap script. Do not interpret stderr output
  as an indicator of failure. Do not rely on trap and errexit, because they
  do not work for command lists.
* Include the last lines of output for error context.
* Ensure we have an IPv4 address for localhost.
* Remove unnecessary cloud-init configuration to preserve SSH host keys.
---
 controllers/cluster_scripts/cloud_init.tmpl | 128 ++++++++++----------
 1 file changed, 66 insertions(+), 62 deletions(-)

diff --git a/controllers/cluster_scripts/cloud_init.tmpl b/controllers/cluster_scripts/cloud_init.tmpl
index ff1e9d9ab..d7823a0f9 100644
--- a/controllers/cluster_scripts/cloud_init.tmpl
+++ b/controllers/cluster_scripts/cloud_init.tmpl
@@ -3,83 +3,87 @@ users:
 - name: root
   lock_passwd: false
 write_files:
-# Due to a known issue with VMware Guest Customization, cloud-init believes every boot
-# is the first boot. This ensures that cloud-init does not remove SSH keys on a reboot.
-- path: /etc/cloud/cloud.cfg.d/cse.cfg
+# On first boot, cloud-init writes all files defined in userdata. At the same time,
+# VMware Guest Customization configures networking, and reboots the machine when it is done.
+# Any files in /run are not preserved. We need cloud-init to fetch userdata and write the
+# files again. We clear the cloud-init cache, and reboot. Cloud-init thinks it is the
+# first boot, and fetches the userdata, and writes the files.
+- path: /root/replace-userdata-files.sh
   owner: root
   content: |
-    ssh_deletekeys: false
-# The control_plane.sh script runs on the first control plane machine. The node.sh script
-# runs on every subsequent control plane machine, and every worker machine.
-- path: /root/ {{- if .ControlPlane -}} control_plane {{- else -}} node {{- end -}} .sh
+    #!/usr/bin/env bash
+    function _log() {
+      echo "$(date -u +"%Y-%m-%d %H:%M:%S") $@" >> /var/log/capvcd/replace-userdata-files.log
+    }
+
+    mkdir -p /var/log/capvcd
+
+    _log "Checking for kubeadm configuration file"
+    if [ -f /run/kubeadm/kubeadm.yaml ] || [ -f /run/kubeadm/kubeadm-join-config.yaml ]; then
+      _log "kubeadm configuration file found, exiting"
+      exit 0
+    fi
+    _log "kubeadm configuration file not found, cleaning cloud-init cache and rebooting"
+    cloud-init clean
+    reboot
+- path: /root/bootstrap.sh
   owner: root
   content: |
     #!/usr/bin/env bash
-      catch() {
-        vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_status $?"
-        ERROR_MESSAGE="$(date) $(caller): $BASH_COMMAND"
-        echo "$ERROR_MESSAGE" &>> /var/log/capvcd/customization/error.log
-        vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_failure_reason $ERROR_MESSAGE"
-        CLOUD_INIT_OUTPUT=""
-        if [[ -f /var/log/cloud-init-output.log ]]
-        then
-          CLOUD_INIT_OUTPUT=$(> /var/log/capvcd/customization/status.log
 {{- if .ControlPlane }}
+    # Exit on the first error. Does not apply to command lists or pipelines.
+    set -o errexit
-      VCLOUD_BASIC_AUTH_PATH=/root/vcloud-basic-auth.yaml
-      VCLOUD_CONFIGMAP_PATH=/root/vcloud-configmap.yaml
-      VCLOUD_CCM_PATH=/root/cloud-director-ccm.yaml
-      VCLOUD_CSI_CONFIGMAP_PATH=/root/vcloud-csi-configmap.yaml
-      CSI_DRIVER_PATH=/root/csi-driver.yaml
-      CSI_CONTROLLER_PATH=/root/csi-controller.yaml
-      CSI_NODE_PATH=/root/csi-node.yaml
 {{- end }}
+    # Our images do not require any network customization,
+    # but CAPVCD requires a successful status to finish bootstrapping.
+    vmtoolsd --cmd "info-set guestinfo.postcustomization.networkconfiguration.status successful"
-      vmtoolsd --cmd "info-set guestinfo.postcustomization.networkconfiguration.status in_progress"
-      echo 'net.ipv6.conf.all.disable_ipv6 = 1' >> /etc/sysctl.conf
-      echo 'net.ipv6.conf.default.disable_ipv6 = 1' >> /etc/sysctl.conf
-      echo 'net.ipv6.conf.lo.disable_ipv6 = 1' >> /etc/sysctl.conf
-      sudo sysctl -p
-      # also remove ipv6 localhost entry from /etc/hosts
-      sed -i 's/::1/127.0.0.1/g' /etc/hosts || true
-      vmtoolsd --cmd "info-set guestinfo.postcustomization.networkconfiguration.status successful"
+    # Our images do not ship the VCD metering service,
+    # but CAPVCD requires a successful status to finish bootstrapping.
+    vmtoolsd --cmd "info-set guestinfo.metering.status successful"
-      # Our images do not ship the VCD metering service, but CAPVCD requires a successful status to finish bootstrapping.
-      vmtoolsd --cmd "info-set guestinfo.metering.status successful"
+    vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} in_progress"
-      vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} in_progress"
-      {
-{{ .BootstrapRunCmd }}
-      }
-      if [[ ! -f /run/cluster-api/bootstrap-success.complete ]]
-      then
-        echo "file /run/cluster-api/bootstrap-success.complete not found" &>> /var/log/capvcd/customization/error.log
-        exit 1
-      fi
-      vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} successful"
+    # Run the preKubeadmCommands, and then kubeadm itself.
+{{ .BootstrapRunCmd }}
+
+      # Kubeadm is the first command in a bash "list of commands," and its failure
+      # does not cause this subshell to exit. Therefore, we check the "sentinel" also created
+      # in the "list of commands," and exit if it is missing.
+      if [[ ! -f /run/cluster-api/bootstrap-success.complete ]]; then
+        echo "file /run/cluster-api/bootstrap-success.complete not found"
+        exit 1
+      fi
+
+      vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} successful"
+
+      exit 0
+    ) &>> /var/log/capvcd/bootstrap.log
+    bootstrap_exit_code=$?
+
+    # Write the exit code to the VM metadata.
+    vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_status $bootstrap_exit_code"
+
+    # Use the last lines of the bootstrap log to give context about any failure.
+    TAIL_LOG="$(tail --lines=10 /var/log/capvcd/bootstrap.log)"
+    vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_failure_reason $TAIL_LOG"
-      echo "$(date) post customization script execution completed" &>> /var/log/capvcd/customization/status.log
-      exit 0
+    # Write cloud-init output for additional context.
+    vmtoolsd --cmd "info-set guestinfo.post_customization_cloud_init_output $(
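
The second patch's rationale, "do not rely on trap and errexit, because they
do not work for command lists," refers to a bash gotcha: errexit and the ERR
trap ignore a failure on the left-hand side of a `&&`/`||` list, and the
kubeadm bootstrap command rendered into {{ .BootstrapRunCmd }} is exactly
such a list. A minimal sketch of the gotcha (illustration only, not part of
the patches):

    #!/usr/bin/env bash
    set -o errexit
    trap 'echo "ERR trap fired"' ERR

    # `false` fails on the left of `&&`, so the right side is skipped,
    # but errexit does not stop the script and the ERR trap is not run.
    false && echo "sentinel created"

    echo "the script keeps running despite the failure above"

This is why the bootstrap script cannot simply trust errexit, and instead
checks for the sentinel file that only the successful path creates.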
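
The bootstrap.sh wrapper introduced by the second patch reduces to the
following pattern (a sketch under assumptions: report_status is a
hypothetical stand-in for the vmtoolsd guestinfo calls; the log and sentinel
paths are the ones used in the patch):

    #!/usr/bin/env bash
    LOG=/var/log/capvcd/bootstrap.log
    mkdir -p "$(dirname "$LOG")"
    report_status() { echo "guestinfo: $*"; }  # stand-in for: vmtoolsd --cmd "info-set ..."

    (
      set -o errexit
      # preKubeadmCommands and kubeadm run here as a command list that
      # creates a sentinel file only on success, for example:
      #   kubeadm init ... && echo success >/run/cluster-api/bootstrap-success.complete
      # Because a left-hand failure in the list does not abort the subshell,
      # check the sentinel explicitly.
      [[ -f /run/cluster-api/bootstrap-success.complete ]] || exit 1
    ) &>>"$LOG"
    code=$?

    report_status "post_customization_script_execution_status $code"
    # Surface the last lines of the log so a failure is visible from the VM metadata.
    report_status "failure_reason: $(tail --lines=10 "$LOG")"

Capturing all output of the subshell into one log, then reporting its tail,
replaces the old approach of treating any stderr output as a failure.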
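
The guestinfo keys that these scripts publish can be read back from inside
the guest with open-vm-tools, which is useful when debugging a failed
bootstrap, for example:

    vmtoolsd --cmd "info-get guestinfo.post_customization_script_execution_status"
    vmtoolsd --cmd "info-get guestinfo.post_customization_script_execution_failure_reason"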