Skip to content

Commit bf1c037

Browse files
Retry on delete backend failure (#330)
* Retry create/delete of backend on failure and ignore reconcile if lb is not active
1 parent 3cff30b commit bf1c037

6 files changed

+270
-238
lines changed

cloud/scope/load_balancer_reconciler.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ func (s *ClusterScope) ReconcileApiServerLB(ctx context.Context) error {
3737
return err
3838
}
3939
if lb != nil {
40+
if lb.LifecycleState != loadbalancer.LoadBalancerLifecycleStateActive {
41+
return errors.New(fmt.Sprintf("load balancer is in %s state. Waiting for ACTIVE state.", lb.LifecycleState))
42+
}
4043
lbIP, err := s.getLoadbalancerIp(*lb)
4144
if err != nil {
4245
return err

cloud/scope/load_balancer_reconciler_test.go

Lines changed: 64 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -97,11 +97,12 @@ func TestLBReconciliation(t *testing.T) {
9797
})).
9898
Return(loadbalancer.GetLoadBalancerResponse{
9999
LoadBalancer: loadbalancer.LoadBalancer{
100-
Id: common.String("lb-id"),
101-
FreeformTags: tags,
102-
DefinedTags: make(map[string]map[string]interface{}),
103-
IsPrivate: common.Bool(false),
104-
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
100+
Id: common.String("lb-id"),
101+
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
102+
FreeformTags: tags,
103+
DefinedTags: make(map[string]map[string]interface{}),
104+
IsPrivate: common.Bool(false),
105+
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
105106
IpAddresses: []loadbalancer.IpAddress{
106107
{
107108
IpAddress: common.String("2.2.2.2"),
@@ -123,11 +124,12 @@ func TestLBReconciliation(t *testing.T) {
123124
})).
124125
Return(loadbalancer.GetLoadBalancerResponse{
125126
LoadBalancer: loadbalancer.LoadBalancer{
126-
Id: common.String("lb-id"),
127-
FreeformTags: tags,
128-
DefinedTags: make(map[string]map[string]interface{}),
129-
IsPrivate: common.Bool(false),
130-
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
127+
Id: common.String("lb-id"),
128+
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
129+
FreeformTags: tags,
130+
DefinedTags: make(map[string]map[string]interface{}),
131+
IsPrivate: common.Bool(false),
132+
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
131133
},
132134
}, nil)
133135
},
@@ -143,11 +145,12 @@ func TestLBReconciliation(t *testing.T) {
143145
})).
144146
Return(loadbalancer.GetLoadBalancerResponse{
145147
LoadBalancer: loadbalancer.LoadBalancer{
146-
Id: common.String("lb-id"),
147-
FreeformTags: tags,
148-
DefinedTags: make(map[string]map[string]interface{}),
149-
IsPrivate: common.Bool(false),
150-
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
148+
Id: common.String("lb-id"),
149+
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
150+
FreeformTags: tags,
151+
DefinedTags: make(map[string]map[string]interface{}),
152+
IsPrivate: common.Bool(false),
153+
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
151154
IpAddresses: []loadbalancer.IpAddress{
152155
{
153156
IpAddress: common.String("2.2.2.2"),
@@ -189,11 +192,12 @@ func TestLBReconciliation(t *testing.T) {
189192
})).
190193
Return(loadbalancer.GetLoadBalancerResponse{
191194
LoadBalancer: loadbalancer.LoadBalancer{
192-
Id: common.String("lb-id"),
193-
FreeformTags: tags,
194-
DefinedTags: make(map[string]map[string]interface{}),
195-
IsPrivate: common.Bool(false),
196-
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
195+
Id: common.String("lb-id"),
196+
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
197+
FreeformTags: tags,
198+
DefinedTags: make(map[string]map[string]interface{}),
199+
IsPrivate: common.Bool(false),
200+
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
197201
IpAddresses: []loadbalancer.IpAddress{
198202
{
199203
IpAddress: common.String("2.2.2.2"),
@@ -537,11 +541,12 @@ func TestLBReconciliation(t *testing.T) {
537541
})).
538542
Return(loadbalancer.GetLoadBalancerResponse{
539543
LoadBalancer: loadbalancer.LoadBalancer{
540-
Id: common.String("lb-id"),
541-
FreeformTags: tags,
542-
DefinedTags: make(map[string]map[string]interface{}),
543-
IsPrivate: common.Bool(false),
544-
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
544+
Id: common.String("lb-id"),
545+
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
546+
FreeformTags: tags,
547+
DefinedTags: make(map[string]map[string]interface{}),
548+
IsPrivate: common.Bool(false),
549+
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
545550
IpAddresses: []loadbalancer.IpAddress{
546551
{
547552
IpAddress: common.String("2.2.2.2"),
@@ -570,6 +575,34 @@ func TestLBReconciliation(t *testing.T) {
570575
}, nil)
571576
},
572577
},
578+
{
579+
name: "lb not active",
580+
errorExpected: true,
581+
errorSubStringMatch: true,
582+
matchError: errors.New(fmt.Sprintf("load balancer is in %s state. Waiting for ACTIVE state.", loadbalancer.LoadBalancerLifecycleStateCreating)),
583+
testSpecificSetup: func(clusterScope *ClusterScope, lbClient *mock_lb.MockLoadBalancerClient) {
584+
ociClusterAccessor.OCICluster.Spec.NetworkSpec.APIServerLB.LoadBalancerId = common.String("lb-id")
585+
lbClient.EXPECT().GetLoadBalancer(gomock.Any(), gomock.Eq(loadbalancer.GetLoadBalancerRequest{
586+
LoadBalancerId: common.String("lb-id"),
587+
})).
588+
Return(loadbalancer.GetLoadBalancerResponse{
589+
LoadBalancer: loadbalancer.LoadBalancer{
590+
Id: common.String("lb-id"),
591+
LifecycleState: loadbalancer.LoadBalancerLifecycleStateCreating,
592+
FreeformTags: tags,
593+
DefinedTags: make(map[string]map[string]interface{}),
594+
IsPrivate: common.Bool(false),
595+
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
596+
IpAddresses: []loadbalancer.IpAddress{
597+
{
598+
IpAddress: common.String("2.2.2.2"),
599+
IsPublic: common.Bool(true),
600+
},
601+
},
602+
},
603+
}, nil)
604+
},
605+
},
573606
{
574607
name: "lb update request failed",
575608
errorExpected: true,
@@ -582,11 +615,12 @@ func TestLBReconciliation(t *testing.T) {
582615
})).
583616
Return(loadbalancer.GetLoadBalancerResponse{
584617
LoadBalancer: loadbalancer.LoadBalancer{
585-
Id: common.String("lb-id"),
586-
FreeformTags: tags,
587-
DefinedTags: make(map[string]map[string]interface{}),
588-
IsPrivate: common.Bool(false),
589-
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
618+
Id: common.String("lb-id"),
619+
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
620+
FreeformTags: tags,
621+
DefinedTags: make(map[string]map[string]interface{}),
622+
IsPrivate: common.Bool(false),
623+
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
590624
IpAddresses: []loadbalancer.IpAddress{
591625
{
592626
IpAddress: common.String("2.2.2.2"),

cloud/scope/machine.go

Lines changed: 89 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -521,33 +521,29 @@ func (m *MachineScope) ReconcileCreateInstanceOnLB(ctx context.Context) error {
521521
backendName := instanceIp + ":" + strconv.Itoa(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port))
522522
if !m.containsLBBackend(backendSet, backendName) {
523523
logger := m.Logger.WithValues("backend-set", *backendSet.Name)
524-
workRequest := m.OCIMachine.Status.CreateBackendWorkRequestId
525524
logger.Info("Checking work request status for create backend")
526-
if workRequest != "" {
527-
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, &workRequest)
528-
if err != nil {
529-
return err
530-
}
531-
} else {
532-
resp, err := m.LoadBalancerClient.CreateBackend(ctx, loadbalancer.CreateBackendRequest{
533-
LoadBalancerId: loadbalancerId,
534-
BackendSetName: backendSet.Name,
535-
CreateBackendDetails: loadbalancer.CreateBackendDetails{
536-
IpAddress: common.String(instanceIp),
537-
Port: common.Int(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port)),
538-
},
539-
OpcRetryToken: ociutil.GetOPCRetryToken("%s-%s", "create-backend", string(m.OCIMachine.UID)),
540-
})
541-
if err != nil {
542-
return err
543-
}
544-
m.OCIMachine.Status.CreateBackendWorkRequestId = *resp.OpcWorkRequestId
545-
logger.Info("Add instance to LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
546-
logger.Info("Waiting for LB work request to be complete")
547-
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, resp.OpcWorkRequestId)
548-
if err != nil {
549-
return err
550-
}
525+
// we always try to create the backend if it exists during a reconcile loop and wait for the work request
526+
// to complete. If there is a work request in progress, in the rare case, CAPOCI pod restarts during the
527+
// work request, the create backend call may throw an error which is ok, as reconcile will go into
528+
// an exponential backoff
529+
resp, err := m.LoadBalancerClient.CreateBackend(ctx, loadbalancer.CreateBackendRequest{
530+
LoadBalancerId: loadbalancerId,
531+
BackendSetName: backendSet.Name,
532+
CreateBackendDetails: loadbalancer.CreateBackendDetails{
533+
IpAddress: common.String(instanceIp),
534+
Port: common.Int(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port)),
535+
},
536+
OpcRetryToken: ociutil.GetOPCRetryToken("%s-%s", "create-backend", string(m.OCIMachine.UID)),
537+
})
538+
if err != nil {
539+
return err
540+
}
541+
m.OCIMachine.Status.CreateBackendWorkRequestId = *resp.OpcWorkRequestId
542+
logger.Info("Add instance to LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
543+
logger.Info("Waiting for LB work request to be complete")
544+
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, resp.OpcWorkRequestId)
545+
if err != nil {
546+
return err
551547
}
552548
}
553549

@@ -562,37 +558,32 @@ func (m *MachineScope) ReconcileCreateInstanceOnLB(ctx context.Context) error {
562558
if !m.containsNLBBackend(backendSet, m.Name()) {
563559
logger := m.Logger.WithValues("backend-set", *backendSet.Name)
564560
logger.Info("Checking work request status for create backend")
565-
workRequest := m.OCIMachine.Status.CreateBackendWorkRequestId
566-
if workRequest != "" {
567-
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, &workRequest)
568-
if err != nil {
569-
return err
570-
}
571-
} else {
572-
resp, err := m.NetworkLoadBalancerClient.CreateBackend(ctx, networkloadbalancer.CreateBackendRequest{
573-
NetworkLoadBalancerId: loadbalancerId,
574-
BackendSetName: backendSet.Name,
575-
CreateBackendDetails: networkloadbalancer.CreateBackendDetails{
576-
IpAddress: common.String(instanceIp),
577-
Port: common.Int(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port)),
578-
Name: common.String(m.Name()),
579-
},
580-
OpcRetryToken: ociutil.GetOPCRetryToken("%s-%s", "create-backend", string(m.OCIMachine.UID)),
581-
})
582-
if err != nil {
583-
return err
584-
}
585-
m.OCIMachine.Status.CreateBackendWorkRequestId = *resp.OpcWorkRequestId
586-
logger.Info("Add instance to NLB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
587-
logger.Info("Waiting for NLB work request to be complete")
588-
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, resp.OpcWorkRequestId)
589-
if err != nil {
590-
return err
591-
}
592-
logger.Info("NLB Backend addition work request is complete")
561+
// we always try to create the backend if it exists during a reconcile loop and wait for the work request
562+
// to complete. If there is a work request in progress, in the rare case, CAPOCI pod restarts during the
563+
// work request, the create backend call may throw an error which is ok, as reconcile will go into
564+
// an exponential backoff
565+
resp, err := m.NetworkLoadBalancerClient.CreateBackend(ctx, networkloadbalancer.CreateBackendRequest{
566+
NetworkLoadBalancerId: loadbalancerId,
567+
BackendSetName: backendSet.Name,
568+
CreateBackendDetails: networkloadbalancer.CreateBackendDetails{
569+
IpAddress: common.String(instanceIp),
570+
Port: common.Int(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port)),
571+
Name: common.String(m.Name()),
572+
},
573+
OpcRetryToken: ociutil.GetOPCRetryToken("%s-%s", "create-backend", string(m.OCIMachine.UID)),
574+
})
575+
if err != nil {
576+
return err
577+
}
578+
m.OCIMachine.Status.CreateBackendWorkRequestId = *resp.OpcWorkRequestId
579+
logger.Info("Add instance to NLB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
580+
logger.Info("Waiting for NLB work request to be complete")
581+
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, resp.OpcWorkRequestId)
582+
if err != nil {
583+
return err
593584
}
585+
logger.Info("NLB Backend addition work request is complete")
594586
}
595-
596587
}
597588
return nil
598589
}
@@ -629,36 +620,31 @@ func (m *MachineScope) ReconcileDeleteInstanceOnLB(ctx context.Context) error {
629620
backendName := instanceIp + ":" + strconv.Itoa(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port))
630621
if m.containsLBBackend(backendSet, backendName) {
631622
logger := m.Logger.WithValues("backend-set", *backendSet.Name)
632-
workRequest := m.OCIMachine.Status.DeleteBackendWorkRequestId
633-
if workRequest != "" {
634-
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, &workRequest)
635-
if err != nil {
636-
return err
637-
}
638-
} else {
639-
// in OCI CLI, the colon in the backend name is replaced by %3A
640-
// replace the colon in the backend name by %3A to avoid the error in PCA
641-
escapedBackendName := url.QueryEscape(backendName)
642-
643-
resp, err := m.LoadBalancerClient.DeleteBackend(ctx, loadbalancer.DeleteBackendRequest{
644-
LoadBalancerId: loadbalancerId,
645-
BackendSetName: backendSet.Name,
646-
BackendName: common.String(escapedBackendName),
647-
})
648-
if err != nil {
649-
logger.Error(err, "Delete instance from LB backend-set failed",
650-
"backendSetName", *backendSet.Name,
651-
"backendName", escapedBackendName,
652-
)
653-
return err
654-
}
655-
m.OCIMachine.Status.DeleteBackendWorkRequestId = *resp.OpcWorkRequestId
656-
logger.Info("Delete instance from LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
657-
logger.Info("Waiting for LB work request to be complete")
658-
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, resp.OpcWorkRequestId)
659-
if err != nil {
660-
return err
661-
}
623+
// in OCI CLI, the colon in the backend name is replaced by %3A
624+
// replace the colon in the backend name by %3A to avoid the error in PCA
625+
escapedBackendName := url.QueryEscape(backendName)
626+
// we always try to delete the backend if it exists during a reconcile loop and wait for the work request
627+
// to complete. If there is a work request in progress, in the rare case, CAPOCI pod restarts during the
628+
// work request, the delete backend call may throw an error which is ok, as reconcile will go into
629+
// an exponential backoff
630+
resp, err := m.LoadBalancerClient.DeleteBackend(ctx, loadbalancer.DeleteBackendRequest{
631+
LoadBalancerId: loadbalancerId,
632+
BackendSetName: backendSet.Name,
633+
BackendName: common.String(escapedBackendName),
634+
})
635+
if err != nil {
636+
logger.Error(err, "Delete instance from LB backend-set failed",
637+
"backendSetName", *backendSet.Name,
638+
"backendName", escapedBackendName,
639+
)
640+
return err
641+
}
642+
m.OCIMachine.Status.DeleteBackendWorkRequestId = *resp.OpcWorkRequestId
643+
logger.Info("Delete instance from LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
644+
logger.Info("Waiting for LB work request to be complete")
645+
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, resp.OpcWorkRequestId)
646+
if err != nil {
647+
return err
662648
}
663649
logger.Info("LB Backend addition work request is complete")
664650
}
@@ -672,28 +658,24 @@ func (m *MachineScope) ReconcileDeleteInstanceOnLB(ctx context.Context) error {
672658
backendSet := lb.BackendSets[APIServerLBBackendSetName]
673659
if m.containsNLBBackend(backendSet, m.Name()) {
674660
logger := m.Logger.WithValues("backend-set", *backendSet.Name)
675-
workRequest := m.OCIMachine.Status.DeleteBackendWorkRequestId
676-
if workRequest != "" {
677-
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, &workRequest)
678-
if err != nil {
679-
return err
680-
}
681-
} else {
682-
resp, err := m.NetworkLoadBalancerClient.DeleteBackend(ctx, networkloadbalancer.DeleteBackendRequest{
683-
NetworkLoadBalancerId: loadbalancerId,
684-
BackendSetName: backendSet.Name,
685-
BackendName: common.String(m.Name()),
686-
})
687-
if err != nil {
688-
return err
689-
}
690-
m.OCIMachine.Status.DeleteBackendWorkRequestId = *resp.OpcWorkRequestId
691-
logger.Info("Delete instance from LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
692-
logger.Info("Waiting for LB work request to be complete")
693-
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, resp.OpcWorkRequestId)
694-
if err != nil {
695-
return err
696-
}
661+
// we always try to delete the backend if it exists during a reconcile loop and wait for the work request
662+
// to complete. If there is a work request in progress, in the rare case, CAPOCI pod restarts during the
663+
// work request, the delete backend call may throw an error which is ok, as reconcile will go into
664+
// an exponential backoff
665+
resp, err := m.NetworkLoadBalancerClient.DeleteBackend(ctx, networkloadbalancer.DeleteBackendRequest{
666+
NetworkLoadBalancerId: loadbalancerId,
667+
BackendSetName: backendSet.Name,
668+
BackendName: common.String(m.Name()),
669+
})
670+
if err != nil {
671+
return err
672+
}
673+
m.OCIMachine.Status.DeleteBackendWorkRequestId = *resp.OpcWorkRequestId
674+
logger.Info("Delete instance from LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
675+
logger.Info("Waiting for LB work request to be complete")
676+
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, resp.OpcWorkRequestId)
677+
if err != nil {
678+
return err
697679
}
698680
}
699681
}

0 commit comments

Comments
 (0)