Browse Source

feat: fix cert-controller readiness probe (#2857)

readiness probes are being executed independently from the
leader election status. The current implementation depends on
leader election (client cache etc.) to run properly.
This commit fixes that by short-circuiting the readiness probes
when the mgr is not the leader.

This bug surfaces when `leader-election=true` and cert-controller `replicas>=2`.

Signed-off-by: Moritz Johner <beller.moritz@googlemail.com>
Moritz Johner 2 years ago
parent
commit
f5cd6816aa

+ 3 - 2
cmd/certcontroller.go

@@ -87,7 +87,8 @@ var certcontrollerCmd = &cobra.Command{
 			setupLog.Error(err, "unable to start manager")
 			os.Exit(1)
 		}
-		crdctrl := crds.New(mgr.GetClient(), mgr.GetScheme(),
+
+		crdctrl := crds.New(mgr.GetClient(), mgr.GetScheme(), mgr.Elected(),
 			ctrl.Log.WithName("controllers").WithName("webhook-certs-updater"),
 			crdRequeueInterval, serviceName, serviceNamespace, secretName, secretNamespace, crdNames)
 		if err := crdctrl.SetupWithManager(mgr, controller.Options{
@@ -97,7 +98,7 @@ var certcontrollerCmd = &cobra.Command{
 			os.Exit(1)
 		}
 
-		whc := webhookconfig.New(mgr.GetClient(), mgr.GetScheme(),
+		whc := webhookconfig.New(mgr.GetClient(), mgr.GetScheme(), mgr.Elected(),
 			ctrl.Log.WithName("controllers").WithName("webhook-certs-updater"),
 			serviceName, serviceNamespace,
 			secretName, secretNamespace, crdRequeueInterval)

+ 47 - 22
pkg/controllers/crds/crds_controller.go

@@ -72,26 +72,30 @@ type Reconciler struct {
 	RequeueInterval time.Duration
 
 	// the controller is ready when all crds are injected
-	rdyMu          *sync.Mutex
-	readyStatusMap map[string]bool
+	// and the controller is elected as leader
+	leaderChan       <-chan struct{}
+	leaderElected    bool
+	readyStatusMapMu *sync.Mutex
+	readyStatusMap   map[string]bool
 }
 
-func New(k8sClient client.Client, scheme *runtime.Scheme, logger logr.Logger,
+func New(k8sClient client.Client, scheme *runtime.Scheme, leaderChan <-chan struct{}, logger logr.Logger,
 	interval time.Duration, svcName, svcNamespace, secretName, secretNamespace string, resources []string) *Reconciler {
 	return &Reconciler{
-		Client:          k8sClient,
-		Log:             logger,
-		Scheme:          scheme,
-		SvcName:         svcName,
-		SvcNamespace:    svcNamespace,
-		SecretName:      secretName,
-		SecretNamespace: secretNamespace,
-		RequeueInterval: interval,
-		CrdResources:    resources,
-		CAName:          "external-secrets",
-		CAOrganization:  "external-secrets",
-		rdyMu:           &sync.Mutex{},
-		readyStatusMap:  map[string]bool{},
+		Client:           k8sClient,
+		Log:              logger,
+		Scheme:           scheme,
+		SvcName:          svcName,
+		SvcNamespace:     svcNamespace,
+		SecretName:       secretName,
+		SecretNamespace:  secretNamespace,
+		RequeueInterval:  interval,
+		CrdResources:     resources,
+		CAName:           "external-secrets",
+		CAOrganization:   "external-secrets",
+		leaderChan:       leaderChan,
+		readyStatusMapMu: &sync.Mutex{},
+		readyStatusMap:   map[string]bool{},
 	}
 }
 
@@ -117,14 +121,14 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
 		err := r.updateCRD(ctx, req)
 		if err != nil {
 			log.Error(err, "failed to inject conversion webhook")
-			r.rdyMu.Lock()
+			r.readyStatusMapMu.Lock()
 			r.readyStatusMap[req.NamespacedName.Name] = false
-			r.rdyMu.Unlock()
+			r.readyStatusMapMu.Unlock()
 			return ctrl.Result{}, err
 		}
-		r.rdyMu.Lock()
+		r.readyStatusMapMu.Lock()
 		r.readyStatusMap[req.NamespacedName.Name] = true
-		r.rdyMu.Unlock()
+		r.readyStatusMapMu.Unlock()
 	}
 	return ctrl.Result{RequeueAfter: r.RequeueInterval}, nil
 }
@@ -132,14 +136,35 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
 // ReadyCheck reviews if all webhook configs have been injected into the CRDs
 // and if the referenced webhook service is ready.
 func (r *Reconciler) ReadyCheck(_ *http.Request) error {
+	// skip readiness check if we're not leader
+	// as we depend on caches and being able to reconcile Webhooks
+	if !r.leaderElected {
+		select {
+		case <-r.leaderChan:
+			r.leaderElected = true
+		default:
+			return nil
+		}
+	}
+	if err := r.checkCRDs(); err != nil {
+		return err
+	}
+	return r.checkEndpoints()
+}
+
+func (r Reconciler) checkCRDs() error {
 	for _, res := range r.CrdResources {
-		r.rdyMu.Lock()
+		r.readyStatusMapMu.Lock()
 		rdy := r.readyStatusMap[res]
-		r.rdyMu.Unlock()
+		r.readyStatusMapMu.Unlock()
 		if !rdy {
 			return fmt.Errorf(errResNotReady, res)
 		}
 	}
+	return nil
+}
+
+func (r Reconciler) checkEndpoints() error {
 	var eps corev1.Endpoints
 	err := r.Get(context.TODO(), types.NamespacedName{
 		Name:      r.SvcName,

+ 3 - 1
pkg/controllers/crds/suite_test.go

@@ -77,7 +77,9 @@ var _ = BeforeSuite(func() {
 	Expect(err).ToNot(HaveOccurred())
 	Expect(k8sClient).ToNot(BeNil())
 
-	rec := New(k8sClient, k8sManager.GetScheme(), log, time.Second*1,
+	leaderChan := make(chan struct{})
+	close(leaderChan)
+	rec := New(k8sClient, k8sManager.GetScheme(), leaderChan, log, time.Second*1,
 		"foo", "default", "foo", "default", []string{
 			"secretstores.test.io",
 		})

+ 3 - 2
pkg/controllers/webhookconfig/suite_test.go

@@ -84,8 +84,9 @@ var _ = BeforeSuite(func() {
 	k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme})
 	Expect(err).ToNot(HaveOccurred())
 	Expect(k8sClient).ToNot(BeNil())
-
-	reconciler = New(k8sClient, k8sManager.GetScheme(), ctrl.Log, ctrlSvcName, ctrlSvcNamespace, ctrlSecretName, ctrlSecretNamespace, time.Second)
+	leaderChan := make(chan struct{})
+	close(leaderChan)
+	reconciler = New(k8sClient, k8sManager.GetScheme(), leaderChan, ctrl.Log, ctrlSvcName, ctrlSvcNamespace, ctrlSecretName, ctrlSecretNamespace, time.Second)
 	reconciler.SetupWithManager(k8sManager, controller.Options{})
 	Expect(err).ToNot(HaveOccurred())
 

+ 28 - 11
pkg/controllers/webhookconfig/webhookconfig.go

@@ -46,11 +46,16 @@ type Reconciler struct {
 	SecretName      string
 	SecretNamespace string
 
-	rdyMu *sync.Mutex
-	ready bool
+	// store state for the readiness probe.
+	// we're ready when we're not the leader or
+	// if we've reconciled the webhook config when we're the leader.
+	leaderChan     <-chan struct{}
+	leaderElected  bool
+	webhookReadyMu *sync.Mutex
+	webhookReady   bool
 }
 
-func New(k8sClient client.Client, scheme *runtime.Scheme,
+func New(k8sClient client.Client, scheme *runtime.Scheme, leaderChan <-chan struct{},
 	log logr.Logger, svcName, svcNamespace, secretName, secretNamespace string,
 	requeueInterval time.Duration) *Reconciler {
 	return &Reconciler{
@@ -62,8 +67,10 @@ func New(k8sClient client.Client, scheme *runtime.Scheme,
 		SvcNamespace:    svcNamespace,
 		SecretName:      secretName,
 		SecretNamespace: secretNamespace,
-		rdyMu:           &sync.Mutex{},
-		ready:           false,
+		leaderChan:      leaderChan,
+		leaderElected:   false,
+		webhookReadyMu:  &sync.Mutex{},
+		webhookReady:    false,
 	}
 }
 
@@ -109,9 +116,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
 
 	// right now we only have one single
 	// webhook config we care about
-	r.rdyMu.Lock()
-	defer r.rdyMu.Unlock()
-	r.ready = true
+	r.webhookReadyMu.Lock()
+	defer r.webhookReadyMu.Unlock()
+	r.webhookReady = true
 	return ctrl.Result{
 		RequeueAfter: r.RequeueDuration,
 	}, nil
@@ -126,9 +133,19 @@ func (r *Reconciler) SetupWithManager(mgr ctrl.Manager, opts controller.Options)
 }
 
 func (r *Reconciler) ReadyCheck(_ *http.Request) error {
-	r.rdyMu.Lock()
-	defer r.rdyMu.Unlock()
-	if !r.ready {
+	// skip readiness check if we're not leader
+	// as we depend on caches and being able to reconcile Webhooks
+	if !r.leaderElected {
+		select {
+		case <-r.leaderChan:
+			r.leaderElected = true
+		default:
+			return nil
+		}
+	}
+	r.webhookReadyMu.Lock()
+	defer r.webhookReadyMu.Unlock()
+	if !r.webhookReady {
 		return fmt.Errorf(errWebhookNotReady)
 	}
 	var eps v1.Endpoints