diff --git a/README.md b/README.md
index c7183f29..be546662 100644
--- a/README.md
+++ b/README.md
@@ -121,3 +121,6 @@ https://wikitech.wikimedia.org/wiki/PAWS/Admin#Deployment
 If the entire project is removed two parts of paws are not managed by tofu/ansible.
 Object storage container: An object storage container named "tofu-state" will need to be generated in horizon. This is where the state file for tofu resides.
 NFS: The NFS server is not included. And a fresh NFS server will be needed for paws to operate.
+
+# Backing up Prometheus
+See `ansible/files/prometheus-data.sh` for an example of backing up and restoring Prometheus data.
diff --git a/ansible/files/prometheus-data.sh b/ansible/files/prometheus-data.sh
new file mode 100644
index 00000000..c2152582
--- /dev/null
+++ b/ansible/files/prometheus-data.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+#
+# Copy Prometheus data from the previous cluster into the current one.
+#
+# deploy.sh leaves two kubeconfigs behind for this script:
+#   /tmp/kube.config.previous  cluster that is being replaced
+#   /tmp/kube.config.current   cluster that was just deployed
+#
+# Backing up is best effort: when the previous cluster cannot be reached or
+# the archive is unusable the script warns and exits 0 without touching the
+# current cluster. Once a valid archive exists, any restore failure is fatal.
+
+set -uo pipefail
+
+readonly NAMESPACE=metrics
+readonly SELECTOR=app.kubernetes.io/component=server
+readonly CONTAINER=prometheus-server
+readonly DEPLOYMENT=deployment.apps/prometheus-server
+readonly PREVIOUS_KUBECONFIG=/tmp/kube.config.previous
+readonly CURRENT_KUBECONFIG=/tmp/kube.config.current
+# Archive path inside the container, relative to its working directory.
+readonly REMOTE_ARCHIVE=backup.tar.gz
+
+# Local scratch directory; removed by cleanup() on exit.
+WORKDIR=''
+
+warn() { printf 'prometheus-data: %s\n' "$*" >&2; }
+
+die() {
+  warn "$*"
+  exit 1
+}
+
+cleanup() {
+  [[ -z "${WORKDIR}" ]] || rm -rf -- "${WORKDIR}"
+}
+
+# Print the name of the first Prometheus server pod in the cluster selected
+# by KUBECONFIG; fails when there is none.
+server_pod() {
+  local pod
+  pod=$(kubectl -n "${NAMESPACE}" get pods --selector="${SELECTOR}" \
+    -o jsonpath='{.items[0].metadata.name}') || return 1
+  [[ -n "${pod}" ]] || return 1
+  printf '%s\n' "${pod}"
+}
+
+# Archive /data on the previous cluster into local file $1.
+# Returns non-zero unless $1 ends up as a readable gzip'd tar archive.
+backup() {
+  local archive=$1 pod
+  export KUBECONFIG="${PREVIOUS_KUBECONFIG}"
+  pod=$(server_pod) || return 1
+  # No -it: this runs unattended, so there is no terminal to attach.
+  # tar's exit status is only reported; the archive is validated below.
+  kubectl -n "${NAMESPACE}" exec "pod/${pod}" -c "${CONTAINER}" -- \
+    tar cfz "${REMOTE_ARCHIVE}" /data \
+    || warn "tar reported errors while archiving /data"
+  kubectl cp "${NAMESPACE}/${pod}:${REMOTE_ARCHIVE}" "${archive}" \
+    -c "${CONTAINER}" || return 1
+  tar -tzf "${archive}" >/dev/null 2>&1
+}
+
+# Replace /data on the current cluster with the contents of local file $1
+# and restart Prometheus so it loads the restored data.
+restore() {
+  local archive=$1 pod
+  export KUBECONFIG="${CURRENT_KUBECONFIG}"
+  # Wait first so the pod looked up below is the one that is running.
+  kubectl -n "${NAMESPACE}" wait --for=condition=ready pod \
+    -l "${SELECTOR}" --timeout=600s \
+    || die "prometheus server pod did not become ready"
+  pod=$(server_pod) || die "no prometheus server pod on the current cluster"
+  kubectl cp "${archive}" "${NAMESPACE}/${pod}:${REMOTE_ARCHIVE}" \
+    -c "${CONTAINER}" || die "could not copy the archive into ${pod}"
+  # The archive was validated before this point, so wiping /data is safe.
+  kubectl -n "${NAMESPACE}" exec "pod/${pod}" -c "${CONTAINER}" -- \
+    sh -c "rm -rf /data/* ; tar xfz ${REMOTE_ARCHIVE} -C /" \
+    || die "could not unpack the archive in ${pod}"
+  kubectl -n "${NAMESPACE}" rollout restart "${DEPLOYMENT}" \
+    || die "could not restart ${DEPLOYMENT}"
+}
+
+main() {
+  local archive
+  trap cleanup EXIT
+  if [[ ! -r "${PREVIOUS_KUBECONFIG}" ]]; then
+    warn "${PREVIOUS_KUBECONFIG} not found; nothing to migrate"
+    return 0
+  fi
+  [[ -r "${CURRENT_KUBECONFIG}" ]] || die "${CURRENT_KUBECONFIG} not found"
+  WORKDIR=$(mktemp -d) || die "mktemp failed"
+  archive="${WORKDIR}/prometheus.tar.gz"
+  if ! backup "${archive}"; then
+    warn "no usable backup from the previous cluster; current data untouched"
+    return 0
+  fi
+  sleep 150 # make a little gap of time to keep data from overlapping
+  restore "${archive}"
+}
+
+main "$@"
diff --git a/ansible/paws.yaml b/ansible/paws.yaml
index 9e12ea8b..9fbc9ce3 100644
--- a/ansible/paws.yaml
+++ b/ansible/paws.yaml
@@ -70,6 +70,13 @@
           - value: controller.config.allow-snippet-annotations=true
             value_type: string
 
+    - name: Create metrics namespace for prometheus
+      kubernetes.core.k8s:
+        name: metrics
+        kind: Namespace
+        state: present
+      register: prometheus
+
     - name: Prometheus
       kubernetes.core.helm:
         name: prometheus
@@ -85,6 +92,10 @@
         template: "templates/prometheus-ingress.yaml.j2"
         namespace: metrics
 
+    - name: Pull in previous prometheus data
+      ansible.builtin.script: files/prometheus-data.sh
+      when: prometheus.changed
+
     - name: Add jupyterhub chart repo
       kubernetes.core.helm_repository:
         name: jupyterhub
diff --git a/deploy.sh b/deploy.sh
index d6ea1c72..f5ebb42f 100644
--- a/deploy.sh
+++ b/deploy.sh
@@ -42,6 +42,9 @@ fi
 
 source secrets-${datacenter}.sh
 
+# save current kube.config in case we need to transfer prometheus data
+cp "$(pwd)/tofu/kube.config" /tmp/kube.config.previous || true # if it isn't there just keep going
+
 python3 -m venv .venv/deploy
 source .venv/deploy/bin/activate
 pip install ansible==8.1.0 kubernetes==26.1.0
@@ -51,6 +54,7 @@ cd tofu
 AWS_ACCESS_KEY_ID=${ACCESS_KEY} AWS_SECRET_ACCESS_KEY=${SECRET_KEY} tofu init -backend-config=${datacenter}-backend.conf
 AWS_ACCESS_KEY_ID=${ACCESS_KEY} AWS_SECRET_ACCESS_KEY=${SECRET_KEY} tofu apply -var datacenter=${datacenter}
 export KUBECONFIG=$(pwd)/kube.config
+cp "$(pwd)/kube.config" /tmp/kube.config.current
 
 if [ "${tofuonly}" = '1' ]
 then