From 66ecf82670254cafae0e84f9fc30d6fa454e5e28 Mon Sep 17 00:00:00 2001
From: Daniel Lipovetsky
Date: Wed, 11 Oct 2023 13:51:59 -0700
Subject: [PATCH 1/2] feat: Update cloud-init customization

Changes relative to upstream:

* Add explanatory comments
* Do not use stderr output of preKubeadmCommands to indicate an error with
  bootstrapping

Changes relative to our fork:

* Do not enable IPv6
* Do not remove cloud-init logs and seed
* Do not disable VMware customization
* Do not disable network configuration
* Do not truncate cloud-init-output.log
* Do not report status of HTTP proxy configuration
* Do not configure cloud-init to remove SSH keys on first boot
* Remove commands that are already executed as a result of being defined in
  `preKubeadmCommands`
---
 controllers/cluster_scripts/cloud_init.tmpl | 49 +++++++++++----------
 1 file changed, 26 insertions(+), 23 deletions(-)

diff --git a/controllers/cluster_scripts/cloud_init.tmpl b/controllers/cluster_scripts/cloud_init.tmpl
index ea5f93e19..ff1e9d9ab 100644
--- a/controllers/cluster_scripts/cloud_init.tmpl
+++ b/controllers/cluster_scripts/cloud_init.tmpl
@@ -3,6 +3,14 @@ users:
 - name: root
   lock_passwd: false
 write_files:
+# Due to a known issue with VMware Guest Customization, cloud-init believes every boot
+# is the first boot. This ensures that cloud-init does not remove SSH keys on a reboot.
+- path: /etc/cloud/cloud.cfg.d/cse.cfg
+  owner: root
+  content: |
+    ssh_deletekeys: false
+# The control_plane.sh script runs on the first control plane machine. The node.sh script
+# runs on every subsequent control plane machine, and every worker machine.
 - path: /root/ {{- if .ControlPlane -}} control_plane {{- else -}} node {{- end -}} .sh
   owner: root
   content: |
@@ -11,12 +19,14 @@ write_files:
         vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_status $?"
         ERROR_MESSAGE="$(date) $(caller): $BASH_COMMAND"
         echo "$ERROR_MESSAGE" &>> /var/log/capvcd/customization/error.log
-        if [[ -s /root/kubeadm.err ]]
+        vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_failure_reason $ERROR_MESSAGE"
+
+        CLOUD_INIT_OUTPUT=""
+        if [[ -f /var/log/cloud-init-output.log ]]
         then
-          KUBEADM_FAILURE=$(cat /root/kubeadm.err)
-          ERROR_MESSAGE="$ERROR_MESSAGE $KUBEADM_FAILURE"
+          CLOUD_INIT_OUTPUT=$(/etc/hosts
-      echo "127.0.0.1 localhost" >>/etc/hosts
-      echo "{{ .MachineName }}" >/etc/hostname
-      echo "127.0.0.1" `hostname` >>/etc/hosts
+      echo 'net.ipv6.conf.all.disable_ipv6 = 1' >> /etc/sysctl.conf
+      echo 'net.ipv6.conf.default.disable_ipv6 = 1' >> /etc/sysctl.conf
+      echo 'net.ipv6.conf.lo.disable_ipv6 = 1' >> /etc/sysctl.conf
+      sudo sysctl -p
+      # also remove ipv6 localhost entry from /etc/hosts
+      sed -i 's/::1/127.0.0.1/g' /etc/hosts || true
       vmtoolsd --cmd "info-set guestinfo.postcustomization.networkconfiguration.status successful"
-      vmtoolsd --cmd "info-set guestinfo.metering.status in_progress"
+      # Our images do not ship the VCD metering service, but CAPVCD requires a successful status to finish bootstrapping.
       vmtoolsd --cmd "info-set guestinfo.metering.status successful"
-      vmtoolsd --cmd "info-set guestinfo.postcustomization.proxy.setting.status in_progress"
-      vmtoolsd --cmd "info-set guestinfo.postcustomization.proxy.setting.status successful"
-
       vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} in_progress"
-      {{ .BootstrapRunCmd }}
+      {
+{{ .BootstrapRunCmd }}
+      }
       if [[ ! -f /run/cluster-api/bootstrap-success.complete ]]
       then
         echo "file /run/cluster-api/bootstrap-success.complete not found" &>> /var/log/capvcd/customization/error.log
@@ -58,12 +68,8 @@ write_files:
       echo "$(date) post customization script execution completed" &>> /var/log/capvcd/customization/status.log
       exit 0
 runcmd:
-- 'sudo cloud-init clean --seed --logs'
-- 'sudo cat /dev/null > /var/log/cloud-init-output.log'
+- 'cloud-init clean'
 {{ if .ControlPlane }}
-- '[ ! -f /run/kubeadm/konvoy-set-kube-proxy-configuration.sh] && sudo reboot'
-- '[ ! -f /run/konvoy/containerd-apply-patches.sh] && sudo reboot'
-- '[ ! -f /run/konvoy/restart-containerd-and-wait.sh] && sudo reboot'
- '[ ! -f /root/control_plane.sh ] && sudo reboot'
- '[ ! -f /run/kubeadm/kubeadm.yaml ] && sudo reboot'
- bash /root/control_plane.sh
@@ -74,9 +80,6 @@ runcmd:
 {{ end }}
 timezone: UTC
 disable_root: false
-disable_vmware_customization: true
-network:
-  config: disabled
 preserve_hostname: false
 hostname: "{{ .MachineName }}"
-final_message: "The system is ready after $UPTIME seconds"
\ No newline at end of file
+final_message: "The system is ready after $UPTIME seconds"

From d8316d165661a26a725fb2a6302f7d60fe237af9 Mon Sep 17 00:00:00 2001
From: Daniel Lipovetsky
Date: Thu, 12 Oct 2023 19:45:19 -0700
Subject: [PATCH 2/2] Simplify cloud-init

* Use a shell script to clean the cloud-init cache and reboot.
* Fix error handling of the bootstrap script. Do not interpret stderr output
  as an indicator of failure. Do not rely on trap and errexit, because they
  do not work for command lists.
* Include the last lines of output for error context.
* Ensure we have an IPv4 address for localhost.
* Remove unnecessary cloud-init configuration to preserve SSH host keys.
---
 controllers/cluster_scripts/cloud_init.tmpl | 128 ++++++++++----------
 1 file changed, 66 insertions(+), 62 deletions(-)

diff --git a/controllers/cluster_scripts/cloud_init.tmpl b/controllers/cluster_scripts/cloud_init.tmpl
index ff1e9d9ab..d7823a0f9 100644
--- a/controllers/cluster_scripts/cloud_init.tmpl
+++ b/controllers/cluster_scripts/cloud_init.tmpl
@@ -3,83 +3,87 @@ users:
 - name: root
   lock_passwd: false
 write_files:
-# Due to a known issue with VMware Guest Customization, cloud-init believes every boot
-# is the first boot. This ensures that cloud-init does not remove SSH keys on a reboot.
-- path: /etc/cloud/cloud.cfg.d/cse.cfg
+# On first boot, cloud-init writes all files defined in userdata. At the same time,
+# VMware Guest Customization configures networking, and reboots the machine when it is done.
+# Any files in /run are not preserved. We need cloud-init to fetch userdata and write the
+# files again. We clear the cloud-init cache, and reboot. Cloud-init thinks it is the
+# first boot, and fetches the userdata, and writes the files.
+- path: /root/replace-userdata-files.sh
   owner: root
   content: |
-    ssh_deletekeys: false
-# The control_plane.sh script runs on the first control plane machine. The node.sh script
-# runs on every subsequent control plane machine, and every worker machine.
-- path: /root/ {{- if .ControlPlane -}} control_plane {{- else -}} node {{- end -}} .sh
+    #!/usr/bin/env bash
+    function _log() {
+      echo "$(date -u +"%Y-%m-%d %H:%M:%S") $@" >> /var/log/capvcd/replace-userdata-files.log
+    }
+
+    mkdir -p /var/log/capvcd
+
+    _log "Checking for kubeadm configuration file"
+    if [ -f /run/kubeadm/kubeadm.yaml ] || [ -f /run/kubeadm/kubeadm-join-config.yaml ]; then
+      _log "kubeadm configuration file found, exiting"
+      exit 0
+    fi
+    _log "kubeadm configuration file not found, cleaning cloud-init cache and rebooting"
+    cloud-init clean
+    reboot
+- path: /root/bootstrap.sh
   owner: root
   content: |
     #!/usr/bin/env bash
-      catch() {
-        vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_status $?"
-        ERROR_MESSAGE="$(date) $(caller): $BASH_COMMAND"
-        echo "$ERROR_MESSAGE" &>> /var/log/capvcd/customization/error.log
-        vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_failure_reason $ERROR_MESSAGE"
-        CLOUD_INIT_OUTPUT=""
-        if [[ -f /var/log/cloud-init-output.log ]]
-        then
-          CLOUD_INIT_OUTPUT=$(> /var/log/capvcd/customization/status.log
 {{- if .ControlPlane }}
+    # Exit on the first error. Does not apply to command lists or pipelines.
+    set -o errexit
-      VCLOUD_BASIC_AUTH_PATH=/root/vcloud-basic-auth.yaml
-      VCLOUD_CONFIGMAP_PATH=/root/vcloud-configmap.yaml
-      VCLOUD_CCM_PATH=/root/cloud-director-ccm.yaml
-      VCLOUD_CSI_CONFIGMAP_PATH=/root/vcloud-csi-configmap.yaml
-      CSI_DRIVER_PATH=/root/csi-driver.yaml
-      CSI_CONTROLLER_PATH=/root/csi-controller.yaml
-      CSI_NODE_PATH=/root/csi-node.yaml
 {{- end }}
+    # Our images do not require any network customization,
+    # but CAPVCD requires a successful status to finish bootstrapping.
+    vmtoolsd --cmd "info-set guestinfo.postcustomization.networkconfiguration.status successful"
-      vmtoolsd --cmd "info-set guestinfo.postcustomization.networkconfiguration.status in_progress"
-      echo 'net.ipv6.conf.all.disable_ipv6 = 1' >> /etc/sysctl.conf
-      echo 'net.ipv6.conf.default.disable_ipv6 = 1' >> /etc/sysctl.conf
-      echo 'net.ipv6.conf.lo.disable_ipv6 = 1' >> /etc/sysctl.conf
-      sudo sysctl -p
-      # also remove ipv6 localhost entry from /etc/hosts
-      sed -i 's/::1/127.0.0.1/g' /etc/hosts || true
-      vmtoolsd --cmd "info-set guestinfo.postcustomization.networkconfiguration.status successful"
+    # Our images do not ship the VCD metering service,
+    # but CAPVCD requires a successful status to finish bootstrapping.
+    vmtoolsd --cmd "info-set guestinfo.metering.status successful"
-      # Our images do not ship the VCD metering service, but CAPVCD requires a successful status to finish bootstrapping.
-      vmtoolsd --cmd "info-set guestinfo.metering.status successful"
+    vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} in_progress"
-      vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} in_progress"
-      {
-{{ .BootstrapRunCmd }}
-      }
-      if [[ ! -f /run/cluster-api/bootstrap-success.complete ]]
-      then
-        echo "file /run/cluster-api/bootstrap-success.complete not found" &>> /var/log/capvcd/customization/error.log
-        exit 1
-      fi
-      vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} successful"
+    # Run the preKubeadmCommands, and then kubeadm itself.
+{{ .BootstrapRunCmd }}
+
+      # Kubeadm is the first command in a bash "list of commands," and its failure
+      # does not cause this subshell to exit. Therefore, we check the "sentinel" also created
+      # in the "list of commands," and exit if it is missing.
+      if [[ ! -f /run/cluster-api/bootstrap-success.complete ]]; then
+        echo "file /run/cluster-api/bootstrap-success.complete not found"
+        exit 1
+      fi
+
+      vmtoolsd --cmd "info-set {{ if .ControlPlane -}} guestinfo.postcustomization.kubeinit.status {{- else -}} guestinfo.postcustomization.kubeadm.node.join.status {{- end }} successful"
+
+      exit 0
+    ) &>> /var/log/capvcd/bootstrap.log
+    bootstrap_exit_code=$?
+
+    # Write the exit code to the VM metadata.
+    vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_status $bootstrap_exit_code"
+
+    # Use the last lines of the bootstrap log to give context about any failure.
+    TAIL_LOG="$(tail --lines=10 /var/log/capvcd/bootstrap.log)"
+    vmtoolsd --cmd "info-set guestinfo.post_customization_script_execution_failure_reason $TAIL_LOG"
-      echo "$(date) post customization script execution completed" &>> /var/log/capvcd/customization/status.log
-      exit 0
+    # Write cloud-init output for additional context.
+    vmtoolsd --cmd "info-set guestinfo.post_customization_cloud_init_output $(
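
The second patch's rationale, "do not rely on trap and errexit, because they
do not work for command lists," refers to a bash gotcha: errexit and the ERR
trap ignore a failure on the left-hand side of a `&&`/`||` list, and the
kubeadm bootstrap command rendered into {{ .BootstrapRunCmd }} is exactly
such a list. A minimal sketch of the gotcha (illustration only, not part of
the patches):

    #!/usr/bin/env bash
    set -o errexit
    trap 'echo "ERR trap fired"' ERR

    # `false` fails on the left of `&&`, so the right side is skipped,
    # but errexit does not stop the script and the ERR trap is not run.
    false && echo "sentinel created"

    echo "the script keeps running despite the failure above"

This is why the bootstrap script cannot simply trust errexit, and instead
checks for the sentinel file that only the successful path creates.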
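
The bootstrap.sh wrapper introduced by the second patch reduces to the
following pattern (a sketch under assumptions: report_status is a
hypothetical stand-in for the vmtoolsd guestinfo calls; the log and sentinel
paths are the ones used in the patch):

    #!/usr/bin/env bash
    LOG=/var/log/capvcd/bootstrap.log
    mkdir -p "$(dirname "$LOG")"
    report_status() { echo "guestinfo: $*"; }  # stand-in for: vmtoolsd --cmd "info-set ..."

    (
      set -o errexit
      # preKubeadmCommands and kubeadm run here as a command list that
      # creates a sentinel file only on success, for example:
      #   kubeadm init ... && echo success >/run/cluster-api/bootstrap-success.complete
      # Because a left-hand failure in the list does not abort the subshell,
      # check the sentinel explicitly.
      [[ -f /run/cluster-api/bootstrap-success.complete ]] || exit 1
    ) &>>"$LOG"
    code=$?

    report_status "post_customization_script_execution_status $code"
    # Surface the last lines of the log so a failure is visible from the VM metadata.
    report_status "failure_reason: $(tail --lines=10 "$LOG")"

Capturing all output of the subshell into one log, then reporting its tail,
replaces the old approach of treating any stderr output as a failure.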
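
The guestinfo keys that these scripts publish can be read back from inside
the guest with open-vm-tools, which is useful when debugging a failed
bootstrap, for example:

    vmtoolsd --cmd "info-get guestinfo.post_customization_script_execution_status"
    vmtoolsd --cmd "info-get guestinfo.post_customization_script_execution_failure_reason"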