Skip to content

Commit

Permalink
fix: introduce 5 second timeout for each Talos client call
Browse files Browse the repository at this point in the history
This should fix the issue where the controller gets stuck while making
requests to Talos nodes.

Signed-off-by: Artem Chernyshev <[email protected]>
  • Loading branch information
Unix4ever committed May 11, 2023
1 parent d2f21be commit 59c39a8
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 0 deletions.
17 changes: 17 additions & 0 deletions controllers/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"context"
"fmt"
"strings"
"time"

controlplanev1 "github.com/siderolabs/cluster-api-control-plane-provider-talos/api/v1alpha3"
"github.com/siderolabs/talos/pkg/machinery/api/machine"
Expand All @@ -17,6 +18,10 @@ import (
)

func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster *clusterv1.Cluster, ownedMachines []clusterv1.Machine) error {
ctx, cancel := context.WithTimeout(ctx, time.Second*5)

defer cancel()

machines := []clusterv1.Machine{}

for _, machine := range ownedMachines {
Expand Down Expand Up @@ -98,6 +103,10 @@ func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, tcp *
// gracefulEtcdLeave removes a given machine from the etcd cluster by forfeiting leadership
// and issuing a "leave" request from the machine itself.
func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *talosclient.Client, cluster client.ObjectKey, machineToLeave clusterv1.Machine) error {
ctx, cancel := context.WithTimeout(ctx, time.Second*5)

defer cancel()

r.Log.Info("verifying etcd status", "machine", machineToLeave.Name, "node", machineToLeave.Status.NodeRef.Name)

svcs, err := c.ServiceInfo(ctx, "etcd")
Expand Down Expand Up @@ -129,6 +138,10 @@ func (r *TalosControlPlaneReconciler) gracefulEtcdLeave(ctx context.Context, c *
// forceEtcdLeave removes a given machine from the etcd cluster by telling another CP node to remove the member.
// This is used in times when the machine was deleted out from under us.
func (r *TalosControlPlaneReconciler) forceEtcdLeave(ctx context.Context, c *talosclient.Client, cluster client.ObjectKey, memberName string) error {
ctx, cancel := context.WithTimeout(ctx, time.Second*5)

defer cancel()

r.Log.Info("removing etcd member", "memberName", memberName)

return c.EtcdRemoveMember(
Expand All @@ -142,6 +155,10 @@ func (r *TalosControlPlaneReconciler) forceEtcdLeave(ctx context.Context, c *tal
// auditEtcd rolls through all etcd members to see if there's a matching controlplane machine.
// It uses the first controlplane node returned as the etcd endpoint.
func (r *TalosControlPlaneReconciler) auditEtcd(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster client.ObjectKey, cpName string) error {
ctx, cancel := context.WithTimeout(ctx, time.Second*5)

defer cancel()

machines, err := r.getControlPlaneMachinesForCluster(ctx, cluster, cpName)
if err != nil {
return err
Expand Down
4 changes: 4 additions & 0 deletions controllers/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ func (e *errServiceUnhealthy) Error() string {
}

func (r *TalosControlPlaneReconciler) nodesHealthcheck(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
ctx, cancel := context.WithTimeout(ctx, time.Second*5)

defer cancel()

client, err := r.talosconfigForMachines(ctx, tcp, machines...)
if err != nil {
return err
Expand Down
4 changes: 4 additions & 0 deletions controllers/taloscontrolplane_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,10 @@ func (r *TalosControlPlaneReconciler) bootControlPlane(ctx context.Context, clus
}

func (r *TalosControlPlaneReconciler) bootstrapCluster(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
ctx, cancel := context.WithTimeout(ctx, time.Second*5)

defer cancel()

c, err := r.talosconfigForMachines(ctx, tcp, machines...)
if err != nil {
return err
Expand Down

0 comments on commit 59c39a8

Please sign in to comment.