diff --git a/.golangci.yml b/.golangci.yml index 28e4237aff4..934407540e8 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -132,6 +132,7 @@ linters-settings: replace-allow-list: - gopkg.in/yaml.v3 - inet.af/tcpproxy + - github.com/insomniacslk/dhcp - github.com/vmware-tanzu/sonobuoy retract-allow-no-explanation: false exclude-forbidden: true diff --git a/go.mod b/go.mod index 0a905ec2b7c..0d8646e5845 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,10 @@ module github.com/talos-systems/talos go 1.19 replace ( + // Forked insomniacslk/dhcp with DHCP spec compliancy fixes + // Upstream PR: https://github.com/insomniacslk/dhcp/pull/469 + github.com/insomniacslk/dhcp => github.com/twelho/dhcp v0.0.0-20220727131347-5d84f50b9c0c + // Use nested module. github.com/talos-systems/talos/pkg/machinery => ./pkg/machinery diff --git a/go.sum b/go.sum index f8395cef204..b92e1c17232 100644 --- a/go.sum +++ b/go.sum @@ -714,8 +714,6 @@ github.com/imdario/mergo v0.3.12 h1:b6R2BslTbIEToALKP7LxUvijTsNI9TAe80pLWN2g/HU= github.com/imdario/mergo v0.3.12/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA= github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NHg9XEKhtSvM= github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= -github.com/insomniacslk/dhcp v0.0.0-20220504074936-1ca156eafb9f h1:l1QCwn715k8nYkj4Ql50rzEog3WnMdrd4YYMMwemxEo= -github.com/insomniacslk/dhcp v0.0.0-20220504074936-1ca156eafb9f/go.mod h1:h+MxyHxRg9NH3terB1nfRIUaQEcI0XOVkdR9LNBlp8E= github.com/j-keck/arping v0.0.0-20160618110441-2cf9dc699c56/go.mod h1:ymszkNOg6tORTn+6F6j+Jc8TOr5osrynvN6ivFWZ2GA= github.com/jmespath/go-jmespath v0.0.0-20160202185014-0b12d6b521d8/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= github.com/jmespath/go-jmespath v0.0.0-20160803190731-bd40a432e4c7/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= @@ -1184,6 +1182,8 @@ github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1 github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/tmc/grpc-websocket-proxy v0.0.0-20201229170055-e5319fda7802/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/tv42/httpunix v0.0.0-20191220191345-2ba4b9c3382c/go.mod h1:hzIxponao9Kjc7aWznkXaL4U4TWaDSs8zcsY4Ka08nM= +github.com/twelho/dhcp v0.0.0-20220727131347-5d84f50b9c0c h1:fF28zFE6+GvLZ89j8HnJu6n1qxzyazzdPDuEbXKCtXg= +github.com/twelho/dhcp v0.0.0-20220727131347-5d84f50b9c0c/go.mod h1:h+MxyHxRg9NH3terB1nfRIUaQEcI0XOVkdR9LNBlp8E= github.com/u-root/u-root v0.9.0 h1:1dpUzrE0FyKrNEjxpKFOkyveuV1f3T0Ko5CQg4gTkCg= github.com/u-root/u-root v0.9.0/go.mod h1:ewc9w6JF1ayZCVC9Y5wsrUiCBw3nMmPC3QItvrEwmew= github.com/u-root/uio v0.0.0-20210528114334-82958018845c/go.mod h1:LpEX5FO/cB+WF4TYGY1V5qktpaZLkKkSegbr0V4eYXA= diff --git a/internal/app/machined/pkg/controllers/network/operator/dhcp4.go b/internal/app/machined/pkg/controllers/network/operator/dhcp4.go index c5cec74559e..a420aa1b374 100644 --- a/internal/app/machined/pkg/controllers/network/operator/dhcp4.go +++ b/internal/app/machined/pkg/controllers/network/operator/dhcp4.go @@ -13,11 +13,14 @@ import ( "sync" "time" + "github.com/cosi-project/runtime/pkg/resource" + "github.com/cosi-project/runtime/pkg/state" "github.com/insomniacslk/dhcp/dhcpv4" "github.com/insomniacslk/dhcp/dhcpv4/nclient4" "go.uber.org/zap" "inet.af/netaddr" + "github.com/talos-systems/talos/internal/app/machined/pkg/controllers/network/operator/utils" "github.com/talos-systems/talos/internal/app/machined/pkg/runtime" "github.com/talos-systems/talos/pkg/machinery/generic/slices" "github.com/talos-systems/talos/pkg/machinery/nethelpers" @@ -27,12 +30,13 @@ import ( // DHCP4 implements the DHCPv4 network operator. type DHCP4 struct { logger *zap.Logger + state state.State linkName string routeMetric uint32 requestMTU bool - offer *dhcpv4.DHCPv4 + lease *nclient4.Lease mu sync.Mutex addresses []network.AddressSpecSpec @@ -44,9 +48,10 @@ type DHCP4 struct { } // NewDHCP4 creates DHCPv4 operator. -func NewDHCP4(logger *zap.Logger, linkName string, routeMetric uint32, platform runtime.Platform) *DHCP4 { +func NewDHCP4(logger *zap.Logger, linkName string, routeMetric uint32, platform runtime.Platform, state state.State) *DHCP4 { return &DHCP4{ logger: logger, + state: state, linkName: linkName, routeMetric: routeMetric, // <3 azure @@ -61,25 +66,146 @@ func (d *DHCP4) Prefix() string { return fmt.Sprintf("dhcp4/%s", d.linkName) } +// extractHostname extracts a hostname from the given resource if it is a valid network.HostnameStatus. +func extractHostname(res resource.Resource) string { + if res, ok := res.(*network.HostnameStatus); ok { + return res.TypedSpec().Hostname + } + + return "" +} + +// setupHostnameWatch returns the initial hostname and a channel that outputs all events related to hostname changes. +func (d *DHCP4) setupHostnameWatch(ctx context.Context) (string, <-chan state.Event, error) { + hostnameWatchCh := make(chan state.Event) + if err := d.state.Watch(ctx, resource.NewMetadata( + network.NamespaceName, + network.HostnameStatusType, + network.HostnameID, + resource.VersionUndefined, + ), hostnameWatchCh); err != nil { + return "", nil, err + } + + return extractHostname((<-hostnameWatchCh).Resource), hostnameWatchCh, nil +} + +// knownHostname checks if the given hostname has been defined by this operator. +func (d *DHCP4) knownHostname(hostname string) bool { + for i := range d.hostname { + if d.hostname[i].Hostname == hostname { + return true + } + } + + return false +} + +// waitForNetworkReady waits for the network to be ready and the leased address to +// be assigned to the associated so that unicast operations can bind successfully. +func (d *DHCP4) waitForNetworkReady(ctx context.Context) error { + // if an IP address has been registered, wait for the address association to be ready + if len(d.addresses) > 0 { + _, err := d.state.WatchFor(ctx, + resource.NewMetadata( + network.NamespaceName, + network.AddressStatusType, + network.AddressID(d.linkName, d.addresses[0].Address), + resource.VersionUndefined, + ), + state.WithPhases(resource.PhaseRunning), + ) + if err != nil { + return err + } + } + + // wait for the network to be ready + _, err := d.state.WatchFor(ctx, + resource.NewMetadata( + network.NamespaceName, + network.StatusType, + network.StatusID, + resource.VersionUndefined, + ), + state.WithCondition(func(res resource.Resource) (bool, error) { + if res, ok := res.(*network.Status); ok { + return res.TypedSpec().AddressReady && res.TypedSpec().ConnectivityReady, nil + } + + return false, nil + }), + ) + + return err +} + // Run the operator loop. // -//nolint:gocyclo,dupl +//nolint:gocyclo,cyclop,dupl func (d *DHCP4) Run(ctx context.Context, notifyCh chan<- struct{}) { const minRenewDuration = 5 * time.Second // protect from renewing too often renewInterval := minRenewDuration + hostname, hostnameWatchCh, err := d.setupHostnameWatch(ctx) + if err != nil && !errors.Is(err, context.Canceled) { + d.logger.Warn("failed to watch for hostname changes", zap.Error(err)) + } + for { - leaseTime, err := d.renew(ctx) + // track if we need to acquire a new lease + newLease := d.lease == nil + + // perform a lease request or renewal + leaseTime, err := d.requestRenew(ctx, hostname) if err != nil && !errors.Is(err, context.Canceled) { - d.logger.Warn("renew failed", zap.Error(err), zap.String("link", d.linkName)) + d.logger.Warn("request/renew failed", zap.Error(err), zap.String("link", d.linkName)) + } + + if newLease { + // notify the underlying controller about the new lease + if err == nil { + select { + case notifyCh <- struct{}{}: + case <-ctx.Done(): + return + } + } + + // wait for networking to be established before transitioning to unicast operations + if err = d.waitForNetworkReady(ctx); err != nil && !errors.Is(err, context.Canceled) { + d.logger.Warn("failed to wait for networking to become ready", zap.Error(err)) + } } + // DHCP hostname parroting protection: if e.g. `dnsmasq` receives a request that both sends + // a hostname and requests one, it will "parrot" the sent hostname back if no other name has + // been defined for the requesting host. That causes update anomalies, since removing a + // hostname defined previously by e.g. the configuration layer causes a copy of that + // hostname to live on in a spec defined by this operator, even though it isn't sourced from + // DHCP. + // + // To avoid this issue, never send and request a hostname in the same operation. When + // negotiating a new lease, send the current hostname when acquiring the lease, and follow + // up with a dedicated INFORM request to ask the server for a DHCP-defined hostname. When + // renewing a lease, we're free to always request a hostname (to detect server-side + // changes), since any changes to the node hostname will cause a lease invalidation and + // re-start the negotiation process. More details below. if err == nil { - select { - case notifyCh <- struct{}{}: - case <-ctx.Done(): - return + // request the node hostname from the DHCP server + err = d.requestHostname(ctx) + if err != nil && !errors.Is(err, context.Canceled) { + d.logger.Warn("hostname request failed", zap.Error(err), zap.String("link", d.linkName)) + } + + // notify the underlying controller about the received hostname and/or renewed lease + if err == nil { + select { + case notifyCh <- struct{}{}: + case <-ctx.Done(): + return + } } } @@ -93,10 +219,39 @@ func (d *DHCP4) Run(ctx context.Context, notifyCh chan<- struct{}) { renewInterval = minRenewDuration } - select { - case <-ctx.Done(): - return - case <-time.After(renewInterval): + for { + select { + case <-ctx.Done(): + return + case <-time.After(renewInterval): + case event := <-hostnameWatchCh: + // If the hostname resource was deleted entirely, we must still inform the DHCP + // server that the node has no hostname anymore. `extractHostname` will return a + // blank hostname for a Tombstone resource generated by a deletion event. + hostname = extractHostname(event.Resource) + + // If, on first invocation, the DHCP server has given a new hostname for the node, + // and the network.HostnameSpecController decides to apply it as a preferred + // hostname, this operator would unnecessarily drop the lease and restart DHCP + // discovery. Thus, if the selected hostname has been sourced from this operator, + // we don't need to do anything. + if d.knownHostname(hostname) { + continue + } + + // While updating the hostname together with a RENEW request works for dnsmasq, it + // doesn't work with the Windows Server DHCP + DNS. A hostname update via an + // INIT-REBOOT request also gets ignored. Thus, the only reliable way to update the + // hostname seems to be to forget the old release and initiate a new DISCOVER flow + // with the new hostname. RFC 2131 doesn't define any better way to do this, and + // since according to the spec the DISCOVER cannot be targeted at the previous + // lessor, the node may switch DHCP servers on hostname change. This is not that + // big of a concern though, since a single network should not have multiple + // competing DHCP servers in the first place. + d.lease = nil + } + + break } } } @@ -149,8 +304,33 @@ func (d *DHCP4) TimeServerSpecs() []network.TimeServerSpecSpec { return d.timeservers } +func (d *DHCP4) parseHostnameFromAck(ack *dhcpv4.DHCPv4) { + d.mu.Lock() + defer d.mu.Unlock() + + if ack.HostName() != "" { + spec := network.HostnameSpecSpec{ + ConfigLayer: network.ConfigOperator, + } + + if err := spec.ParseFQDN(ack.HostName()); err == nil { + if ack.DomainName() != "" { + spec.Domainname = ack.DomainName() + } + + d.hostname = []network.HostnameSpecSpec{ + spec, + } + } else { + d.hostname = nil + } + } else { + d.hostname = nil + } +} + //nolint:gocyclo -func (d *DHCP4) parseAck(ack *dhcpv4.DHCPv4) { +func (d *DHCP4) parseNetworkConfigFromAck(ack *dhcpv4.DHCPv4) { d.mu.Lock() defer d.mu.Unlock() @@ -263,26 +443,6 @@ func (d *DHCP4) parseAck(ack *dhcpv4.DHCPv4) { d.resolvers = nil } - if ack.HostName() != "" { - spec := network.HostnameSpecSpec{ - ConfigLayer: network.ConfigOperator, - } - - if err = spec.ParseFQDN(ack.HostName()); err == nil { - if ack.DomainName() != "" { - spec.Domainname = ack.DomainName() - } - - d.hostname = []network.HostnameSpecSpec{ - spec, - } - } else { - d.hostname = nil - } - } else { - d.hostname = nil - } - if len(ack.NTPServers()) > 0 { convertIP := func(ip net.IP) string { result, _ := netaddr.FromStdIP(ip) @@ -301,12 +461,74 @@ func (d *DHCP4) parseAck(ack *dhcpv4.DHCPv4) { } } -func (d *DHCP4) renew(ctx context.Context) (time.Duration, error) { +func (d *DHCP4) newClient() (*nclient4.Client, error) { + var clientOpts []nclient4.ClientOpt + + // we have an existing lease, target the server with unicast + if d.lease != nil { + serverAddr, err := utils.ToUDPAddr(d.lease.Offer.ServerIPAddr, nclient4.ServerPort) + if err != nil { + return nil, err + } + + clientAddr, err := utils.ToUDPAddr(d.lease.Offer.YourIPAddr, nclient4.ClientPort) + if err != nil { + return nil, err + } + + // RFC 2131, section 4.3.2: + // DHCPREQUEST generated during RENEWING state: + // ... This message will be unicast, so no relay + // agents will be involved in its transmission. + clientOpts = append(clientOpts, + nclient4.WithServerAddr(serverAddr), + // WithUnicast must be specified manually, WithServerAddr is not enough + nclient4.WithUnicast(clientAddr), + ) + } + + // create a new client, the caller is responsible for closing it + return nclient4.New(d.linkName, clientOpts...) +} + +// requestHostname uses an INFORM request to request a hostname from DHCP as requesting +// it during a DISCOVER is not reliable when simultaneously sending the local hostname. +func (d *DHCP4) requestHostname(ctx context.Context) error { + opts := []dhcpv4.OptionCode{ + dhcpv4.OptionHostName, + dhcpv4.OptionDomainName, + } + + client, err := d.newClient() + if err != nil { + return err + } + + //nolint:errcheck + defer client.Close() + + d.logger.Debug("DHCP INFORM", zap.String("link", d.linkName)) + + // acquire a hostname using an additional INFORM request + ack, err := client.InformFromOffer(ctx, d.lease.Offer, dhcpv4.WithRequestedOptions(opts...)) + if err != nil { + return err + } + + d.logger.Debug("DHCP ACK", zap.String("link", d.linkName), zap.String("dhcp", collapseSummary(ack.Summary()))) + + // parse the hostname from the response + d.parseHostnameFromAck(ack) + + return nil +} + +func (d *DHCP4) requestRenew(ctx context.Context, hostname string) (time.Duration, error) { opts := []dhcpv4.OptionCode{ dhcpv4.OptionClasslessStaticRoute, dhcpv4.OptionDomainNameServer, + // TODO(twelho): This is unused until network.ResolverSpec supports search domains dhcpv4.OptionDNSDomainSearchList, - dhcpv4.OptionHostName, dhcpv4.OptionNTPServers, dhcpv4.OptionDomainName, } @@ -316,50 +538,41 @@ func (d *DHCP4) renew(ctx context.Context) (time.Duration, error) { } mods := []dhcpv4.Modifier{dhcpv4.WithRequestedOptions(opts...)} - clientOpts := []nclient4.ClientOpt{} - if d.offer != nil { - // do not use broadcast, but send the packet to DHCP server directly - addr, err := net.ResolveUDPAddr("udp", d.offer.ServerIPAddr.String()+":67") - if err != nil { - return 0, err - } - - // by default it's set to 0.0.0.0 which actually breaks lease renew - d.offer.ClientIPAddr = d.offer.YourIPAddr - - clientOpts = append(clientOpts, nclient4.WithServerAddr(addr)) + // if the node has a hostname, always send it to the DHCP + // server with option 12 during lease acquisition and renewal + if len(hostname) > 0 { + mods = append(mods, dhcpv4.WithOption(dhcpv4.OptHostName(hostname))) } - cli, err := nclient4.New(d.linkName, clientOpts...) + client, err := d.newClient() if err != nil { return 0, err } //nolint:errcheck - defer cli.Close() - - var lease *nclient4.Lease + defer client.Close() - if d.offer != nil { - lease, err = cli.RequestFromOffer(ctx, d.offer, mods...) + if d.lease != nil { + d.logger.Debug("DHCP RENEW", zap.String("link", d.linkName)) + err = client.Renew(ctx, d.lease, mods...) } else { - lease, err = cli.Request(ctx, mods...) + d.logger.Debug("DHCP REQUEST", zap.String("link", d.linkName)) + d.lease, err = client.Request(ctx, mods...) } if err != nil { - // clear offer if request fails to start with discover sequence next time - d.offer = nil + // explicitly clear the lease on failure to start with the discovery sequence next time + d.lease = nil return 0, err } - d.logger.Debug("DHCP ACK", zap.String("link", d.linkName), zap.String("dhcp", collapseSummary(lease.ACK.Summary()))) + d.logger.Debug("DHCP ACK", zap.String("link", d.linkName), zap.String("dhcp", collapseSummary(d.lease.ACK.Summary()))) - d.offer = lease.Offer - d.parseAck(lease.ACK) + d.parseNetworkConfigFromAck(d.lease.ACK) - return lease.ACK.IPAddressLeaseTime(time.Minute * 30), nil + return d.lease.ACK.IPAddressLeaseTime(time.Minute * 30), nil } func collapseSummary(summary string) string { diff --git a/internal/app/machined/pkg/controllers/network/operator/utils/utils.go b/internal/app/machined/pkg/controllers/network/operator/utils/utils.go new file mode 100644 index 00000000000..29e00f45ad2 --- /dev/null +++ b/internal/app/machined/pkg/controllers/network/operator/utils/utils.go @@ -0,0 +1,22 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// Package utils provides common methods for operators. +package utils + +import ( + "fmt" + "net" + "net/netip" +) + +// ToUDPAddr combines the given net.IP and port to form a net.UDPAddr. +func ToUDPAddr(ip net.IP, port uint16) (*net.UDPAddr, error) { + addr, ok := netip.AddrFromSlice(ip) + if !ok { + return nil, fmt.Errorf("failed to parse %q as an IP address", ip) + } + + return net.UDPAddrFromAddrPort(netip.AddrPortFrom(addr, port)), nil +} diff --git a/internal/app/machined/pkg/controllers/network/operator_spec.go b/internal/app/machined/pkg/controllers/network/operator_spec.go index f7bbd151018..eda3bdf787c 100644 --- a/internal/app/machined/pkg/controllers/network/operator_spec.go +++ b/internal/app/machined/pkg/controllers/network/operator_spec.go @@ -421,7 +421,7 @@ func (ctrl *OperatorSpecController) newOperator(logger *zap.Logger, spec *networ case network.OperatorDHCP4: logger = logger.With(zap.String("operator", "dhcp4")) - return operator.NewDHCP4(logger, spec.LinkName, spec.DHCP4.RouteMetric, ctrl.V1alpha1Platform) + return operator.NewDHCP4(logger, spec.LinkName, spec.DHCP4.RouteMetric, ctrl.V1alpha1Platform, ctrl.State) case network.OperatorDHCP6: logger = logger.With(zap.String("operator", "dhcp6")) diff --git a/pkg/machinery/resources/network/hostname_spec.go b/pkg/machinery/resources/network/hostname_spec.go index b2dd7c29891..e8aab928073 100644 --- a/pkg/machinery/resources/network/hostname_spec.go +++ b/pkg/machinery/resources/network/hostname_spec.go @@ -22,7 +22,7 @@ type HostnameSpec = typed.Resource[HostnameSpecSpec, HostnameSpecRD] // HostnameID is the ID of the singleton instance. const HostnameID resource.ID = "hostname" -// HostnameSpecSpec describes node nostname. +// HostnameSpecSpec describes node hostname. // //gotagsrewrite:gen type HostnameSpecSpec struct {