Skip to content

Commit

Permalink
feat: emit Kubernetes events for error conditions (#598)
Browse files Browse the repository at this point in the history
`k/cloud-provider` emits Events for most errors we return. In some
conditions we previously hid the errors from `k/cloud-provider` for
various reasons and only logged a warning. This is hard to discover for
users. With this change, we now also emit Kubernetes Events for these
cases.

---------

Co-authored-by: pauhull <[email protected]>
  • Loading branch information
apricote and phm07 authored Jan 12, 2024
1 parent f71c47c commit e8f9199
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 28 deletions.
11 changes: 10 additions & 1 deletion hcloud/cloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ import (
"strings"

hrobot "github.com/syself/hrobot-go"
corev1 "k8s.io/api/core/v1"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/tools/record"
cloudprovider "k8s.io/cloud-provider"
"k8s.io/klog/v2"

Expand All @@ -46,6 +49,7 @@ type cloud struct {
client *hcloud.Client
robotClient robot.Client
cfg config.HCCMConfiguration
recorder record.EventRecorder
networkID int64
}

Expand Down Expand Up @@ -122,11 +126,15 @@ func newCloud(_ io.Reader) (cloudprovider.Interface, error) {

klog.Infof("Hetzner Cloud k8s cloud controller %s started\n", providerVersion)

eventBroadcaster := record.NewBroadcaster()
recorder := eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "hcloud-cloud-controller-manager"})

return &cloud{
client: client,
robotClient: robotClient,
cfg: cfg,
networkID: networkID,
recorder: recorder,
}, nil
}

Expand All @@ -139,7 +147,7 @@ func (c *cloud) Instances() (cloudprovider.Instances, bool) {
}

func (c *cloud) InstancesV2() (cloudprovider.InstancesV2, bool) {
return newInstances(c.client, c.robotClient, c.cfg.Instance.AddressFamily, c.networkID), true
return newInstances(c.client, c.robotClient, c.recorder, c.cfg.Instance.AddressFamily, c.networkID), true
}

func (c *cloud) Zones() (cloudprovider.Zones, bool) {
Expand All @@ -160,6 +168,7 @@ func (c *cloud) LoadBalancer() (cloudprovider.LoadBalancer, bool) {
NetworkClient: &c.client.Network,
NetworkID: c.networkID,
Cfg: c.cfg,
Recorder: c.recorder,
}

return newLoadBalancers(lbOps, c.cfg.LoadBalancer.DisablePrivateIngress, c.cfg.LoadBalancer.DisableIPv6), true
Expand Down
7 changes: 7 additions & 0 deletions hcloud/cloud_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ import (

"github.com/stretchr/testify/assert"
hrobot "github.com/syself/hrobot-go"
corev1 "k8s.io/api/core/v1"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/tools/record"

"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/testsupport"
"github.com/hetznercloud/hcloud-go/v2/hcloud"
Expand All @@ -38,6 +41,7 @@ type testEnv struct {
Mux *http.ServeMux
Client *hcloud.Client
RobotClient hrobot.RobotClient
Recorder record.EventRecorder
}

func (env *testEnv) Teardown() {
Expand All @@ -46,6 +50,7 @@ func (env *testEnv) Teardown() {
env.Mux = nil
env.Client = nil
env.RobotClient = nil
env.Recorder = nil
}

func newTestEnv() testEnv {
Expand All @@ -59,11 +64,13 @@ func newTestEnv() testEnv {
)
robotClient := hrobot.NewBasicAuthClient("", "")
robotClient.SetBaseURL(server.URL + "/robot")
recorder := record.NewBroadcaster().NewRecorder(scheme.Scheme, corev1.EventSource{Component: "hcloud-cloud-controller-manager"})
return testEnv{
Server: server,
Mux: mux,
Client: client,
RobotClient: robotClient,
Recorder: recorder,
}
}

Expand Down
7 changes: 5 additions & 2 deletions hcloud/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (

hrobotmodels "github.com/syself/hrobot-go/models"
corev1 "k8s.io/api/core/v1"
"k8s.io/client-go/tools/record"
cloudprovider "k8s.io/cloud-provider"

"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/config"
Expand All @@ -35,6 +36,7 @@ import (
type instances struct {
client *hcloud.Client
robotClient robot.Client
recorder record.EventRecorder
addressFamily config.AddressFamily
networkID int64
}
Expand All @@ -44,8 +46,8 @@ var (
errMissingRobotClient = errors.New("no robot client configured, make sure to enable Robot support in the configuration")
)

func newInstances(client *hcloud.Client, robotClient robot.Client, addressFamily config.AddressFamily, networkID int64) *instances {
return &instances{client, robotClient, addressFamily, networkID}
func newInstances(client *hcloud.Client, robotClient robot.Client, recorder record.EventRecorder, addressFamily config.AddressFamily, networkID int64) *instances {
return &instances{client, robotClient, recorder, addressFamily, networkID}
}

// lookupServer attempts to locate the corresponding [*hcloud.Server] or [*hrobotmodels.Server] for a given [*corev1.Node].
Expand Down Expand Up @@ -107,6 +109,7 @@ func (i *instances) lookupServer(
}

if cloudServer != nil && hrobotServer != nil {
i.recorder.Eventf(node, corev1.EventTypeWarning, "InstanceLookupFailed", "Node %s could not be uniquely associated with a Cloud or Robot server, as a server with this name exists in both APIs", node.Name)
return nil, fmt.Errorf("found both a cloud & robot server for name %q", node.Name)
}

Expand Down
8 changes: 4 additions & 4 deletions hcloud/instances_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ func TestInstances_InstanceExists(t *testing.T) {
})
})

instances := newInstances(env.Client, env.RobotClient, config.AddressFamilyIPv4, 0)
instances := newInstances(env.Client, env.RobotClient, env.Recorder, config.AddressFamilyIPv4, 0)

tests := []struct {
name string
Expand Down Expand Up @@ -209,7 +209,7 @@ func TestInstances_InstanceShutdown(t *testing.T) {
})
})

instances := newInstances(env.Client, env.RobotClient, config.AddressFamilyIPv4, 0)
instances := newInstances(env.Client, env.RobotClient, env.Recorder, config.AddressFamilyIPv4, 0)
env.Mux.HandleFunc("/robot/server/3", func(w http.ResponseWriter, r *http.Request) {
json.NewEncoder(w).Encode(hrobotmodels.ServerResponse{
Server: hrobotmodels.Server{
Expand Down Expand Up @@ -343,7 +343,7 @@ func TestInstances_InstanceMetadata(t *testing.T) {
})
})

instances := newInstances(env.Client, env.RobotClient, config.AddressFamilyIPv4, 0)
instances := newInstances(env.Client, env.RobotClient, env.Recorder, config.AddressFamilyIPv4, 0)

metadata, err := instances.InstanceMetadata(context.TODO(), &corev1.Node{
Spec: corev1.NodeSpec{ProviderID: "hcloud://1"},
Expand Down Expand Up @@ -384,7 +384,7 @@ func TestInstances_InstanceMetadataRobotServer(t *testing.T) {
})
})

instances := newInstances(env.Client, env.RobotClient, config.AddressFamilyIPv4, 0)
instances := newInstances(env.Client, env.RobotClient, env.Recorder, config.AddressFamilyIPv4, 0)

metadata, err := instances.InstanceMetadata(context.TODO(), &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Expand Down
2 changes: 1 addition & 1 deletion hcloud/routes.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ func (r *routes) CreateRoute(ctx context.Context, clusterName string, nameHint s

privNet, ok = findServerPrivateNetByID(srv, r.network.ID)
if !ok {
return fmt.Errorf("%s: server %v: network with id %d not attached to this server ", op, route.TargetNode, r.network.ID)
return fmt.Errorf("%s: server %v: network with id %d not attached to this server", op, route.TargetNode, r.network.ID)
}
}
ip := privNet.IP
Expand Down
70 changes: 50 additions & 20 deletions internal/hcops/load_balancer.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"time"

corev1 "k8s.io/api/core/v1"
"k8s.io/client-go/tools/record"
"k8s.io/klog/v2"

"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/annotation"
Expand Down Expand Up @@ -78,6 +79,7 @@ type LoadBalancerOps struct {
RetryDelay time.Duration
NetworkID int64
Cfg config.HCCMConfiguration
Recorder record.EventRecorder
}

// GetByK8SServiceUID tries to find a Load Balancer by its Kubernetes service
Expand Down Expand Up @@ -571,7 +573,7 @@ func (l *LoadBalancerOps) ReconcileHCLBTargets(
// cluster.
k8sNodeIDsHCloud = make(map[int64]bool)
k8sNodeIDsRobot = make(map[int]bool)
k8sNodeNames = make(map[int64]string)
k8sNodes = make(map[int64]*corev1.Node)

robotIPsToIDs = make(map[string]int)
robotIDToIPv4 = make(map[int]string)
Expand Down Expand Up @@ -609,7 +611,7 @@ func (l *LoadBalancerOps) ReconcileHCLBTargets(
} else {
k8sNodeIDsRobot[int(id)] = true
}
k8sNodeNames[id] = node.Name
k8sNodes[id] = node
}

// List all robot servers to check whether the ip targets of the load balancer
Expand Down Expand Up @@ -641,15 +643,23 @@ func (l *LoadBalancerOps) ReconcileHCLBTargets(
continue
}

klog.InfoS("remove target", "op", op, "service", svc.ObjectMeta.Name, "targetName", k8sNodeNames[id])
// k8sNodes[id] can be nil if the node is currently being deleted.
var nodeName string
if node := k8sNodes[id]; node != nil {
nodeName = node.Name
} else {
nodeName = fmt.Sprintf("%d", id)
}

klog.InfoS("remove target", "op", op, "service", svc.ObjectMeta.Name, "targetName", nodeName)
// Target needs to be re-created or node currently not in use by k8s
// Load Balancer. Remove it from the HC Load Balancer
a, _, err := l.LBClient.RemoveServerTarget(ctx, lb, target.Server.Server)
if err != nil {
return changed, fmt.Errorf("%s: target: %s: %w", op, k8sNodeNames[id], err)
return changed, fmt.Errorf("%s: target: %s: %w", op, nodeName, err)
}
if err := WatchAction(ctx, l.ActionClient, a); err != nil {
return changed, fmt.Errorf("%s: target: %s: %w", op, k8sNodeNames[id], err)
return changed, fmt.Errorf("%s: target: %s: %w", op, nodeName, err)
}
changed = true
numberOfTargets--
Expand All @@ -665,13 +675,21 @@ func (l *LoadBalancerOps) ReconcileHCLBTargets(
continue
}

klog.InfoS("remove target", "op", op, "service", svc.ObjectMeta.Name, "targetName", k8sNodeNames[int64(id)])
// k8sNodes[id] can be nil if the node is currently being deleted.
var nodeName string
if node := k8sNodes[int64(id)]; node != nil {
nodeName = node.Name
} else {
nodeName = fmt.Sprintf("%d", id)
}

klog.InfoS("remove target", "op", op, "service", svc.ObjectMeta.Name, "targetName", nodeName)
// Node currently not in use by k8s Load Balancer. Remove it from the HC Load Balancer.
a, _, err := l.LBClient.RemoveIPTarget(ctx, lb, net.ParseIP(ip))
if err != nil {
var e error
if foundServer {
e = fmt.Errorf("%s: target: %s: %w", op, k8sNodeNames[int64(id)], err)
e = fmt.Errorf("%s: target: %s: %w", op, nodeName, err)
} else {
e = fmt.Errorf("%s: targetIP: %s: %w", op, ip, err)
}
Expand All @@ -680,7 +698,7 @@ func (l *LoadBalancerOps) ReconcileHCLBTargets(
if err := WatchAction(ctx, l.ActionClient, a); err != nil {
var e error
if foundServer {
e = fmt.Errorf("%s: target: %s: %w", op, k8sNodeNames[int64(id)], err)
e = fmt.Errorf("%s: target: %s: %w", op, nodeName, err)
} else {
e = fmt.Errorf("%s: targetIP: %s: %w", op, ip, err)
}
Expand All @@ -699,27 +717,29 @@ func (l *LoadBalancerOps) ReconcileHCLBTargets(
if hclbTargetIDs[id] {
continue
}
node := k8sNodes[id]

if numberOfTargets >= lb.LoadBalancerType.MaxTargets {
klog.InfoS("cannot add server target because max number of targets have been reached", "op", op, "service", svc.ObjectMeta.Name, "targetName", k8sNodeNames[id])
l.emitMaxTargetsReachedError(node, svc, op)
continue
}

klog.InfoS("add target", "op", op, "service", svc.ObjectMeta.Name, "targetName", k8sNodeNames[id])
klog.InfoS("add target", "op", op, "service", svc.ObjectMeta.Name, "targetName", node.Name)
opts := hcloud.LoadBalancerAddServerTargetOpts{
Server: &hcloud.Server{ID: id},
UsePrivateIP: &usePrivateIP,
}
a, _, err := l.LBClient.AddServerTarget(ctx, lb, opts)
if err != nil {
if hcloud.IsError(err, hcloud.ErrorCodeResourceLimitExceeded) {
klog.InfoS("resource limit exceeded", "err", err.Error(), "op", op, "service", svc.ObjectMeta.Name, "targetName", k8sNodeNames[id])
return false, nil
l.emitMaxTargetsReachedError(node, svc, op)
// Continue loop so that error is emitted for each node
continue
}
return changed, fmt.Errorf("%s: target %s: %w", op, k8sNodeNames[id], err)
return changed, fmt.Errorf("%s: target %s: %w", op, node.Name, err)
}
if err := WatchAction(ctx, l.ActionClient, a); err != nil {
return changed, fmt.Errorf("%s: target %s: %w", op, k8sNodeNames[id], err)
return changed, fmt.Errorf("%s: target %s: %w", op, node.Name, err)
}
changed = true
numberOfTargets++
Expand All @@ -730,36 +750,38 @@ func (l *LoadBalancerOps) ReconcileHCLBTargets(
// to the K8S Load Balancer as IP targets to the HC Load Balancer.
for id := range k8sNodeIDsRobot {
ip := robotIDToIPv4[id]
node := k8sNodes[int64(id)]

// Don't assign the node again if it is already assigned to the HC load
// balancer.
if hclbTargetIPs[ip] {
continue
}
if ip == "" {
l.Recorder.Eventf(node, corev1.EventTypeWarning, "ServerNotFound", "No server with id %d was found in Robot", id)
klog.InfoS("k8s node found but no corresponding server in robot", "id", id)
continue
}

if numberOfTargets >= lb.LoadBalancerType.MaxTargets {
klog.InfoS("cannot add ip target because max number of targets have been reached", "op", op, "service", svc.ObjectMeta.Name, "targetName", k8sNodeNames[int64(id)])
l.emitMaxTargetsReachedError(node, svc, op)
continue
}

klog.InfoS("add target", "op", op, "service", svc.ObjectMeta.Name, "targetName", k8sNodeNames[int64(id)], "ip", ip)
klog.InfoS("add target", "op", op, "service", svc.ObjectMeta.Name, "targetName", node, "ip", ip)
opts := hcloud.LoadBalancerAddIPTargetOpts{
IP: net.ParseIP(ip),
}
a, _, err := l.LBClient.AddIPTarget(ctx, lb, opts)
if err != nil {
if hcloud.IsError(err, hcloud.ErrorCodeResourceLimitExceeded) {
klog.InfoS("resource limit exceeded", "err", err.Error(), "op", op, "service", svc.ObjectMeta.Name, "targetName", k8sNodeNames[int64(id)])
return false, nil
l.emitMaxTargetsReachedError(node, svc, op)
continue
}
return changed, fmt.Errorf("%s: target %s: %w", op, k8sNodeNames[int64(id)], err)
return changed, fmt.Errorf("%s: target %s: %w", op, node, err)
}
if err := WatchAction(ctx, l.ActionClient, a); err != nil {
return changed, fmt.Errorf("%s: target %s: %w", op, k8sNodeNames[int64(id)], err)
return changed, fmt.Errorf("%s: target %s: %w", op, node, err)
}
changed = true
numberOfTargets++
Expand All @@ -769,6 +791,14 @@ func (l *LoadBalancerOps) ReconcileHCLBTargets(
return changed, nil
}

//nolint:unparam // op might get set to different values in the future
func (l *LoadBalancerOps) emitMaxTargetsReachedError(node *corev1.Node, svc *corev1.Service, op string) {
l.Recorder.Eventf(node, corev1.EventTypeWarning, "MaxTargetsReached",
"Node could not be added to Load Balancer for service %s because the max number of targets has been reached",
svc.ObjectMeta.Name)
klog.InfoS("cannot add server target because max number of targets have been reached", "op", op, "service", svc.ObjectMeta.Name, "targetName", node.Name)
}

func (l *LoadBalancerOps) getUsePrivateIP(svc *corev1.Service) (bool, error) {
usePrivateIP, err := annotation.LBUsePrivateIP.BoolFromService(svc)
if err != nil {
Expand Down
2 changes: 2 additions & 0 deletions internal/hcops/testing.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"testing"

hrobotmodels "github.com/syself/hrobot-go/models"
"k8s.io/client-go/tools/record"

"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/mocks"
"github.com/hetznercloud/hcloud-go/v2/hcloud"
Expand Down Expand Up @@ -49,6 +50,7 @@ func NewLoadBalancerOpsFixture(t *testing.T) *LoadBalancerOpsFixture {
ActionClient: fx.ActionClient,
NetworkClient: fx.NetworkClient,
RobotClient: fx.RobotClient,
Recorder: &record.FakeRecorder{},
}

return fx
Expand Down

0 comments on commit e8f9199

Please sign in to comment.