k0sproject · emosbaugh · Oct 23, 2024 · Nov 5, 2024 · Nov 5, 2024 · Nov 8, 2024
@@ -666,15 +666,19 @@ func (c *command) startWorker(ctx context.Context, profile string, nodeConfig *v
 	return wc.Start(ctx)
 }
 
-// If we've got CA in place we assume the node has already joined previously
+// If the etcd database file is in place for embedded etcd, or the CA key and
+// certificate for external etcd or other storage types, we assume the node has
+// already joined previously.
 func (c *command) needToJoin(nodeConfig *v1beta1.ClusterConfig) bool {
+	if nodeConfig.Spec.Storage.Type == v1beta1.EtcdStorageType && !nodeConfig.Spec.Storage.Etcd.IsExternalClusterUsed() {
+		// Use the main etcd database file as the source of truth to determine if this node has already joined.
+		// See https://etcd.io/docs/v3.5/learning/persistent-storage-files/#bbolt-btree-membersnapdb
+		return !file.Exists(filepath.Join(c.K0sVars.EtcdDataDir, "member", "snap", "db"))
+	}
 	if file.Exists(filepath.Join(c.K0sVars.CertRootDir, "ca.key")) &&
 		file.Exists(filepath.Join(c.K0sVars.CertRootDir, "ca.crt")) {
 		return false
 	}
-	if nodeConfig.Spec.Storage.Type == v1beta1.EtcdStorageType && !nodeConfig.Spec.Storage.Etcd.IsExternalClusterUsed() {
-		return !file.Exists(filepath.Join(c.K0sVars.EtcdDataDir, "member", "snap", "db"))
-	}
 	return true
 }
 

@@ -107,6 +107,11 @@ func (e *Etcd) syncEtcdConfig(ctx context.Context, etcdRequest v1beta1.EtcdReque
 			etcdResponse, err = e.JoinClient.JoinEtcd(ctx, etcdRequest)
 			return err
 		},
+		// When joining multiple nodes in parallel, etcd can lose consensus and return 500 responses.
+		// Allow more time to recover: up to 20 attempts, with delays from 1s up to a 60s cap (~15 minutes).
+		retry.Attempts(20),
+		retry.Delay(1*time.Second),
+		retry.MaxDelay(60*time.Second),
 		retry.Context(ctx),
 		retry.LastErrorOnly(true),
 		retry.OnRetry(func(attempt uint, err error) {
@@ -191,6 +196,8 @@ func (e *Etcd) Start(ctx context.Context) error {
 		"--enable-pprof":                "false",
 	}
 
+	// Use the main etcd database file as the source of truth to determine if this node has already joined.
+	// See https://etcd.io/docs/v3.5/learning/persistent-storage-files/#bbolt-btree-membersnapdb
 	if file.Exists(filepath.Join(e.K0sVars.EtcdDataDir, "member", "snap", "db")) {
 		logrus.Warnf("etcd db file(s) already exist, not gonna run join process")
 	} else if e.JoinClient != nil {