Skip to content

Commit 69e892b

Browse files
Avoid deadlock on start
1 parent e1880f5 commit 69e892b

File tree

2 files changed

+59
-28
lines changed

2 files changed

+59
-28
lines changed

pkg/manager/internal.go

Lines changed: 55 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -104,11 +104,23 @@ type controllerManager struct {
104104
// Healthz probe handler
105105
healthzHandler *healthz.Handler
106106

107-
mu sync.Mutex
108-
started bool
109-
startedLeader bool
107+
// addMu protects controllerManager from Add, AddHealthzCheck, AddMetricsExtraHandler, AddReadyzCheck being
108+
// called while the data they collect are being read.
109+
addMu sync.RWMutex
110+
111+
// started tracks if the check has been started.
112+
started bool
113+
114+
// leader runnable started.
115+
startedLeader bool
116+
117+
// healthz started. In other words, we should not add healthz or readyz to the manager
110118
healthzStarted bool
111-
errChan chan error
119+
120+
// cacheMu protects waitForCache from being executed twice concurrently.
121+
cacheMu sync.Mutex
122+
123+
errChan chan error
112124

113125
// controllerOptions are the global controller options.
114126
controllerOptions v1alpha1.ControllerConfigurationSpec
@@ -192,8 +204,8 @@ type hasCache interface {
192204

193205
// Add sets dependencies on i, and adds it to the list of Runnables to start.
194206
func (cm *controllerManager) Add(r Runnable) error {
195-
cm.mu.Lock()
196-
defer cm.mu.Unlock()
207+
cm.addMu.Lock()
208+
defer cm.addMu.Unlock()
197209
if cm.stopProcedureEngaged {
198210
return errors.New("can't accept new runnable as stop procedure is already engaged")
199211
}
@@ -248,8 +260,8 @@ func (cm *controllerManager) AddMetricsExtraHandler(path string, handler http.Ha
248260
return fmt.Errorf("overriding builtin %s endpoint is not allowed", defaultMetricsEndpoint)
249261
}
250262

251-
cm.mu.Lock()
252-
defer cm.mu.Unlock()
263+
cm.addMu.Lock()
264+
defer cm.addMu.Unlock()
253265

254266
if _, found := cm.metricsExtraHandlers[path]; found {
255267
return fmt.Errorf("can't register extra handler by duplicate path %q on metrics http server", path)
@@ -262,8 +274,8 @@ func (cm *controllerManager) AddMetricsExtraHandler(path string, handler http.Ha
262274

263275
// AddHealthzCheck allows you to add Healthz checker.
264276
func (cm *controllerManager) AddHealthzCheck(name string, check healthz.Checker) error {
265-
cm.mu.Lock()
266-
defer cm.mu.Unlock()
277+
cm.addMu.Lock()
278+
defer cm.addMu.Unlock()
267279

268280
if cm.stopProcedureEngaged {
269281
return errors.New("can't accept new healthCheck as stop procedure is already engaged")
@@ -283,8 +295,8 @@ func (cm *controllerManager) AddHealthzCheck(name string, check healthz.Checker)
283295

284296
// AddReadyzCheck allows you to add Readyz checker.
285297
func (cm *controllerManager) AddReadyzCheck(name string, check healthz.Checker) error {
286-
cm.mu.Lock()
287-
defer cm.mu.Unlock()
298+
cm.addMu.Lock()
299+
defer cm.addMu.Unlock()
288300

289301
if cm.stopProcedureEngaged {
290302
return errors.New("can't accept new ready check as stop procedure is already engaged")
@@ -367,8 +379,8 @@ func (cm *controllerManager) serveMetrics() {
367379
mux.Handle(defaultMetricsEndpoint, handler)
368380

369381
func() {
370-
cm.mu.Lock()
371-
defer cm.mu.Unlock()
382+
cm.addMu.RLock()
383+
defer cm.addMu.RUnlock()
372384

373385
for path, extraHandler := range cm.metricsExtraHandlers {
374386
mux.Handle(path, extraHandler)
@@ -401,8 +413,8 @@ func (cm *controllerManager) serveHealthProbes() {
401413
}
402414

403415
func() {
404-
cm.mu.Lock()
405-
defer cm.mu.Unlock()
416+
cm.addMu.RLock()
417+
defer cm.addMu.RUnlock()
406418

407419
if cm.readyzHandler != nil {
408420
mux.Handle(cm.readinessEndpointName, http.StripPrefix(cm.readinessEndpointName, cm.readyzHandler))
@@ -422,6 +434,9 @@ func (cm *controllerManager) serveHealthProbes() {
422434
}
423435
return nil
424436
}))
437+
438+
// Note: healthzStarted is used by AddMetricsExtraHandler, AddReadyzCheck, but it is safe to change here because
439+
// addMu.RLock() prevents the above functions to be executed concurrently with this operation.
425440
cm.healthzStarted = true
426441
}()
427442

@@ -535,8 +550,11 @@ func (cm *controllerManager) engageStopProcedure(stopComplete <-chan struct{}) e
535550
if cm.gracefulShutdownTimeout == 0 {
536551
return nil
537552
}
538-
cm.mu.Lock()
539-
defer cm.mu.Unlock()
553+
cm.addMu.RLock()
554+
defer cm.addMu.RUnlock()
555+
556+
// Note: stopProcedureEngaged is used by Add, AddMetricsExtraHandler, AddReadyzCheck, but it is safe to change here because
557+
// addMu.RLock() prevents the above functions to be executed concurrently with this operation.
540558
cm.stopProcedureEngaged = true
541559

542560
// we want to close this after the other runnables stop, because we don't
@@ -574,8 +592,8 @@ func (cm *controllerManager) waitForRunnableToEnd(shutdownCancel context.CancelF
574592
}
575593

576594
func (cm *controllerManager) startNonLeaderElectionRunnables() {
577-
cm.mu.Lock()
578-
defer cm.mu.Unlock()
595+
cm.addMu.RLock()
596+
defer cm.addMu.RUnlock()
579597

580598
// First start any webhook servers, which includes conversion, validation, and defaulting
581599
// webhooks that are registered.
@@ -605,8 +623,8 @@ func (cm *controllerManager) startNonLeaderElectionRunnables() {
605623
}
606624

607625
func (cm *controllerManager) startLeaderElectionRunnables() {
608-
cm.mu.Lock()
609-
defer cm.mu.Unlock()
626+
cm.addMu.RLock()
627+
defer cm.addMu.RUnlock()
610628

611629
cm.waitForCache(cm.internalCtx)
612630

@@ -617,10 +635,15 @@ func (cm *controllerManager) startLeaderElectionRunnables() {
617635
cm.startRunnable(c)
618636
}
619637

638+
// Note: startedLeader is used by Add, but it is safe to change here because
639+
// addMu.RLock() prevents the above function to be executed concurrently with this operation.
620640
cm.startedLeader = true
621641
}
622642

623643
func (cm *controllerManager) waitForCache(ctx context.Context) {
644+
cm.cacheMu.Lock()
645+
defer cm.cacheMu.Unlock()
646+
624647
if cm.started {
625648
return
626649
}
@@ -638,14 +661,22 @@ func (cm *controllerManager) waitForCache(ctx context.Context) {
638661
// cm.started as check if we already started the cache so it must always become true.
639662
// Making sure that the cache doesn't get started twice is needed to not get a "close
640663
// of closed channel" panic
664+
665+
// Note: started is used by Add, so it is required to get an addMu lock/RLock before
666+
// calling this func in order to prevent the above function to be executed concurrently
667+
// with this operation.
641668
cm.started = true
642669
}
643670

644671
func (cm *controllerManager) startLeaderElection() (err error) {
645672
ctx, cancel := context.WithCancel(context.Background())
646-
cm.mu.Lock()
673+
674+
// Note: leaderElectionCancel is used by engageStopProcedure, which already gets a addMu.Rlock;
675+
// thus, in order to prevent the above function to be executed concurrently with this operation, we
676+
// require and addMu.Lock also here.
677+
cm.addMu.Lock()
647678
cm.leaderElectionCancel = cancel
648-
cm.mu.Unlock()
679+
cm.addMu.Unlock()
649680

650681
if cm.onStoppedLeading == nil {
651682
cm.onStoppedLeading = func() {

pkg/manager/manager_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1350,8 +1350,8 @@ var _ = Describe("manger.Manager", func() {
13501350

13511351
// Wait for the Manager to start
13521352
Eventually(func() bool {
1353-
mgr.mu.Lock()
1354-
defer mgr.mu.Unlock()
1353+
mgr.addMu.Lock()
1354+
defer mgr.addMu.Unlock()
13551355
return mgr.started
13561356
}).Should(BeTrue())
13571357

@@ -1381,8 +1381,8 @@ var _ = Describe("manger.Manager", func() {
13811381

13821382
// Wait for the Manager to start
13831383
Eventually(func() bool {
1384-
mgr.mu.Lock()
1385-
defer mgr.mu.Unlock()
1384+
mgr.addMu.Lock()
1385+
defer mgr.addMu.Unlock()
13861386
return mgr.started
13871387
}).Should(BeTrue())
13881388

0 commit comments

Comments
 (0)