From 59c39a8e1d0f3062b2b5e9bb47a178b9dfe27bed Mon Sep 17 00:00:00 2001
From: Artem Chernyshev
Date: Wed, 10 May 2023 22:04:51 +0300
Subject: [PATCH] fix: introduce 5 second timeout for each Talos client call

This should fix the issue where the controller gets stuck trying to make
requests to Talos nodes.

Signed-off-by: Artem Chernyshev
---
 controllers/etcd.go                         | 17 +++++++++++++++++
 controllers/health.go                       |  4 ++++
 controllers/taloscontrolplane_controller.go |  4 ++++
 3 files changed, 25 insertions(+)

diff --git a/controllers/etcd.go b/controllers/etcd.go
index 7a36f7a..4cc6604 100644
--- a/controllers/etcd.go
+++ b/controllers/etcd.go
@@ -8,6 +8,7 @@ import (
     "context"
     "fmt"
     "strings"
+    "time"

     controlplanev1 "github.com/siderolabs/cluster-api-control-plane-provider-talos/api/v1alpha3"
     "github.com/siderolabs/talos/pkg/machinery/api/machine"
@@ -17,6 +18,10 @@ import (
 )

 func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster *clusterv1.Cluster, ownedMachines []clusterv1.Machine) error {
+    ctx, cancel := context.WithTimeout(ctx, time.Second*5)
+
+    defer cancel()
+
     machines := []clusterv1.Machine{}

     for _, machine := range ownedMachines {
@@ -98,6 +103,10 @@ func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, tcp *
 // gracefulEtcdLeave removes a given machine from the etcd cluster by forfeiting leadership
 // and issuing a "leave" request from the machine itself.
 func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *talosclient.Client, cluster client.ObjectKey, machineToLeave clusterv1.Machine) error {
+    ctx, cancel := context.WithTimeout(ctx, time.Second*5)
+
+    defer cancel()
+
     r.Log.Info("verifying etcd status", "machine", machineToLeave.Name, "node", machineToLeave.Status.NodeRef.Name)

     svcs, err := c.ServiceInfo(ctx, "etcd")
@@ -129,6 +138,10 @@ func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *
 // forceEtcdLeave removes a given machine from the etcd cluster by telling another CP node to remove the member.
 // This is used in times when the machine was deleted out from under us.
 func (r *TalosControlPlaneReconciler) forceEtcdLeave(ctx context.Context, c *talosclient.Client, cluster client.ObjectKey, memberName string) error {
+    ctx, cancel := context.WithTimeout(ctx, time.Second*5)
+
+    defer cancel()
+
     r.Log.Info("removing etcd member", "memberName", memberName)

     return c.EtcdRemoveMember(
@@ -142,6 +155,10 @@ func (r *TalosControlPlaneReconciler) forceEtcdLeave(ctx context.Context, c *tal
 // auditEtcd rolls through all etcd members to see if there's a matching controlplane machine
 // It uses the first controlplane node returned as the etcd endpoint
 func (r *TalosControlPlaneReconciler) auditEtcd(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster client.ObjectKey, cpName string) error {
+    ctx, cancel := context.WithTimeout(ctx, time.Second*5)
+
+    defer cancel()
+
     machines, err := r.getControlPlaneMachinesForCluster(ctx, cluster, cpName)
     if err != nil {
         return err
diff --git a/controllers/health.go b/controllers/health.go
index d11597b..d8d4e87 100644
--- a/controllers/health.go
+++ b/controllers/health.go
@@ -29,6 +29,10 @@ func (e *errServiceUnhealthy) Error() string {
 }

 func (r *TalosControlPlaneReconciler) nodesHealthcheck(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
+    ctx, cancel := context.WithTimeout(ctx, time.Second*5)
+
+    defer cancel()
+
     client, err := r.talosconfigForMachines(ctx, tcp, machines...)
     if err != nil {
         return err
diff --git a/controllers/taloscontrolplane_controller.go b/controllers/taloscontrolplane_controller.go
index 4bbcfad..547fbaa 100644
--- a/controllers/taloscontrolplane_controller.go
+++ b/controllers/taloscontrolplane_controller.go
@@ -408,6 +408,10 @@ func (r *TalosControlPlaneReconciler) bootControlPlane(ctx context.Context, clus
 }

 func (r *TalosControlPlaneReconciler) bootstrapCluster(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
+    ctx, cancel := context.WithTimeout(ctx, time.Second*5)
+
+    defer cancel()
+
     c, err := r.talosconfigForMachines(ctx, tcp, machines...)
     if err != nil {
         return err
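
For context, the pattern this patch applies in every touched function is a per-call child context created with context.WithTimeout and released via defer cancel(). Below is a minimal, self-contained Go sketch of that pattern; the callTalos helper and the simulated slowCall are hypothetical illustrations and are not code from this repository.

package main

import (
	"context"
	"fmt"
	"time"
)

// callTalos wraps a single client call with a 5-second deadline.
// The derived context is cancelled as soon as the call returns,
// which releases its timer and any in-flight work tied to it.
func callTalos(ctx context.Context, call func(context.Context) error) error {
	ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()

	return call(ctx)
}

func main() {
	// Simulated slow API call: it respects context cancellation,
	// so the deadline turns an indefinite hang into a returned error.
	slowCall := func(ctx context.Context) error {
		select {
		case <-time.After(10 * time.Second):
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}

	if err := callTalos(context.Background(), slowCall); err != nil {
		fmt.Println("call failed:", err) // prints "context deadline exceeded"
	}
}

The design point, as the commit message describes, is that each call gets its own bounded deadline derived from the reconcile context: a single unreachable or wedged node can delay one step by at most five seconds instead of blocking the reconciler indefinitely.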