Skip to content

Commit

Permalink
vm-runner: external traffic Prometheus metrics endpoint (#1153)
Browse files Browse the repository at this point in the history
Add /metrics Prometheus endpoint to neonvm-runner
exposing following metrics:
runner_vm_egress_bytes
runner_vm_ingress_bytes
runner_vm_network_fetch_errors_total
_bytes metrics use iptables and query rules configured in 

https://github.com/neondatabase/cloud/blob/main/compute-init/compute-init.sh#L143
 
Resolves: neondatabase/neon#4704
  • Loading branch information
myrrc authored Dec 6, 2024
1 parent 77bf555 commit 30eee9a
Show file tree
Hide file tree
Showing 6 changed files with 123 additions and 5 deletions.
112 changes: 107 additions & 5 deletions neonvm-runner/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,14 @@ import (
"github.com/containerd/cgroups/v3"
"github.com/containerd/cgroups/v3/cgroup1"
"github.com/containerd/cgroups/v3/cgroup2"
"github.com/coreos/go-iptables/iptables"
"github.com/digitalocean/go-qemu/qmp"
"github.com/docker/libnetwork/types"
"github.com/jpillora/backoff"
"github.com/kdomanski/iso9660"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/samber/lo"
"github.com/vishvananda/netlink"
"go.uber.org/zap"
Expand All @@ -44,6 +47,7 @@ import (

vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1"
"github.com/neondatabase/autoscaling/pkg/api"
"github.com/neondatabase/autoscaling/pkg/util"
"github.com/neondatabase/autoscaling/pkg/util/taskgroup"
)

Expand Down Expand Up @@ -91,6 +95,8 @@ const (
//
// See also: https://neondb.slack.com/archives/C03TN5G758R/p1693462680623239
cpuLimitOvercommitFactor = 4

protocolTCP string = "6"
)

var (
Expand Down Expand Up @@ -669,6 +675,91 @@ func main() {
}
}

type NetworkMonitoringMetrics struct {
IngressBytes, EgressBytes, Errors prometheus.Counter
IngressBytesRaw, EgressBytesRaw uint64 // Absolute values to calc increments for Counters
}

func NewMonitoringMetrics(reg *prometheus.Registry) *NetworkMonitoringMetrics {
m := &NetworkMonitoringMetrics{
IngressBytes: util.RegisterMetric(reg, prometheus.NewCounter(
prometheus.CounterOpts{
Name: "runner_vm_ingress_bytes",
Help: "Number of bytes received by the VM from the open internet",
},
)),
EgressBytes: util.RegisterMetric(reg, prometheus.NewCounter(
prometheus.CounterOpts{
Name: "runner_vm_egress_bytes",
Help: "Number of bytes sent by the VM to the open internet",
},
)),
IngressBytesRaw: 0,
EgressBytesRaw: 0,
Errors: util.RegisterMetric(reg, prometheus.NewCounter(
prometheus.CounterOpts{
Name: "runner_vm_network_fetch_errors_total",
Help: "Number of errors while fetching network monitoring data",
},
)),
}
return m
}

func shouldBeIgnored(ip net.IP) bool {
// We need to measure only external traffic to/from vm, so we filter internal traffic
// Don't filter on isUnspecified as it's an iptables rule, not a real ip
return ip.IsLoopback() || ip.IsPrivate()
}

func getNetworkBytesCounter(iptables *iptables.IPTables, chain string) (uint64, error) {
cnt := uint64(0)
rules, err := iptables.Stats("filter", chain)
if err != nil {
return cnt, err
}

for _, rawStat := range rules {
stat, err := iptables.ParseStat(rawStat)
if err != nil {
return cnt, err
}
src, dest := stat.Source.IP, stat.Destination.IP
if stat.Protocol == protocolTCP && !shouldBeIgnored(src) && !shouldBeIgnored(dest) {
cnt += stat.Bytes
}
}
return cnt, nil
}

func (m *NetworkMonitoringMetrics) update(logger *zap.Logger) {
// Rules configured at github.com/neondatabase/cloud/blob/main/compute-init/compute-init.sh#L98
iptables, err := iptables.New()
if err != nil {
logger.Error("initializing iptables failed", zap.Error(err))
m.Errors.Inc()
return
}

ingress, err := getNetworkBytesCounter(iptables, "INPUT")
if err != nil {
logger.Error("getting iptables input counter failed", zap.Error(err))
m.Errors.Inc()
return
}
m.IngressBytes.Add(float64(ingress - m.IngressBytesRaw))
m.IngressBytesRaw = ingress

egress, err := getNetworkBytesCounter(iptables, "OUTPUT")
if err != nil {
logger.Error("getting iptables output counter failed", zap.Error(err))
m.Errors.Inc()
return
}
m.EgressBytes.Add(float64(egress - m.EgressBytesRaw))
m.EgressBytesRaw = egress
}

func run(logger *zap.Logger) error {
cfg := newConfig(logger)

Expand Down Expand Up @@ -1077,7 +1168,8 @@ func runQEMU(
}

wg.Add(1)
go listenForCPUChanges(ctx, logger, vmSpec.RunnerPort, callbacks, &wg)
monitoring := vmSpec.EnableNetworkMonitoring != nil && *vmSpec.EnableNetworkMonitoring
go listenForHTTPRequests(ctx, logger, vmSpec.RunnerPort, callbacks, &wg, monitoring)
wg.Add(1)
go forwardLogs(ctx, logger, &wg)

Expand Down Expand Up @@ -1179,12 +1271,13 @@ type cpuServerCallbacks struct {
set func(*zap.Logger, vmv1.MilliCPU) error
}

func listenForCPUChanges(
func listenForHTTPRequests(
ctx context.Context,
logger *zap.Logger,
port int32,
callbacks cpuServerCallbacks,
wg *sync.WaitGroup,
networkMonitoring bool,
) {
defer wg.Done()
mux := http.NewServeMux()
Expand All @@ -1197,6 +1290,15 @@ func listenForCPUChanges(
mux.HandleFunc("/cpu_current", func(w http.ResponseWriter, r *http.Request) {
handleCPUCurrent(cpuCurrentLogger, w, r, callbacks.get)
})
if networkMonitoring {
reg := prometheus.NewRegistry()
metrics := NewMonitoringMetrics(reg)
mux.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
metrics.update(logger)
h := promhttp.HandlerFor(reg, promhttp.HandlerOpts{Registry: reg})
h.ServeHTTP(w, r)
})
}
server := http.Server{
Addr: fmt.Sprintf("0.0.0.0:%d", port),
Handler: mux,
Expand All @@ -1211,13 +1313,13 @@ func listenForCPUChanges(
select {
case err := <-errChan:
if errors.Is(err, http.ErrServerClosed) {
logger.Info("cpu_change server closed")
logger.Info("http server closed")
} else if err != nil {
logger.Fatal("cpu_change exited with error", zap.Error(err))
logger.Fatal("http server exited with error", zap.Error(err))
}
case <-ctx.Done():
err := server.Shutdown(context.Background())
logger.Info("shut down cpu_change server", zap.Error(err))
logger.Info("shut down http server", zap.Error(err))
}
}

Expand Down
5 changes: 5 additions & 0 deletions neonvm/apis/neonvm/v1/virtualmachine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,11 @@ type VirtualMachineSpec struct {
// +kubebuilder:default:=QmpScaling
// +optional
CpuScalingMode *CpuScalingMode `json:"cpuScalingMode,omitempty"`

// Enable network monitoring on the VM
// +kubebuilder:default:=false
// +optional
EnableNetworkMonitoring *bool `json:"enableNetworkMonitoring,omitempty"`
}

func (spec *VirtualMachineSpec) Resources() VirtualMachineResources {
Expand Down
1 change: 1 addition & 0 deletions neonvm/apis/neonvm/v1/virtualmachine_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ func (r *VirtualMachine) ValidateUpdate(old runtime.Object) (admission.Warnings,
{".spec.enableAcceleration", func(v *VirtualMachine) any { return v.Spec.EnableAcceleration }},
{".spec.enableSSH", func(v *VirtualMachine) any { return v.Spec.EnableSSH }},
{".spec.initScript", func(v *VirtualMachine) any { return v.Spec.InitScript }},
{".spec.enableNetworkMonitoring", func(v *VirtualMachine) any { return v.Spec.EnableNetworkMonitoring }},
}

for _, info := range immutableFields {
Expand Down
5 changes: 5 additions & 0 deletions neonvm/apis/neonvm/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions neonvm/config/crd/bases/vm.neon.tech_virtualmachines.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1157,6 +1157,10 @@ spec:
default: true
description: Use KVM acceleation
type: boolean
enableNetworkMonitoring:
default: false
description: Enable network monitoring on the VM
type: boolean
enableSSH:
default: true
description: |-
Expand Down
1 change: 1 addition & 0 deletions vm-deploy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ metadata:
autoscaling.neon.tech/testing-only-always-migrate: "false"
spec:
schedulerName: autoscale-scheduler
enableNetworkMonitoring: true
enableSSH: true
guest:
cpus: { min: 0.25, use: 0.25, max: 1.25 }
Expand Down

0 comments on commit 30eee9a

Please sign in to comment.