@@ -44,6 +44,9 @@ def __eq__(self, other):
44
44
self .member_id == other .member_id and
45
45
self .protocol == other .protocol )
46
46
47
+ def __str__ (self ):
48
+ return "<Generation %s (member_id: %s, protocol: %s)>" % (self .generation_id , self .member_id , self .protocol )
49
+
47
50
48
51
Generation .NO_GENERATION = Generation (DEFAULT_GENERATION_ID , UNKNOWN_MEMBER_ID , None )
49
52
@@ -404,17 +407,16 @@ def _handle_join_success(self, member_assignment_bytes):
404
407
# will be invoked even if the consumer is woken up before
405
408
# finishing the rebalance
406
409
with self ._lock :
407
- log .info ("Successfully joined group %s with generation %s" ,
408
- self .group_id , self ._generation .generation_id )
409
410
self .state = MemberState .STABLE
410
411
if self ._heartbeat_thread :
411
412
self ._heartbeat_thread .enable ()
412
413
413
- def _handle_join_failure (self , _ ):
414
+ def _handle_join_failure (self , exception ):
414
415
# we handle failures below after the request finishes.
415
416
# if the join completes after having been woken up,
416
417
# the exception is ignored and we will rejoin
417
418
with self ._lock :
419
+ log .info ("Failed to join group %s: %s" , self .group_id , exception )
418
420
self .state = MemberState .UNJOINED
419
421
420
422
def ensure_active_group (self , timeout_ms = None ):
@@ -572,10 +574,9 @@ def _failed_request(self, node_id, request, future, error):
572
574
future .failure (error )
573
575
574
576
def _handle_join_group_response (self , future , send_time , response ):
577
+ log .debug ("Received JoinGroup response: %s" , response )
575
578
error_type = Errors .for_code (response .error_code )
576
579
if error_type is Errors .NoError :
577
- log .debug ("Received successful JoinGroup response for group %s: %s" ,
578
- self .group_id , response )
579
580
if self ._sensors :
580
581
self ._sensors .join_latency .record ((time .time () - send_time ) * 1000 )
581
582
with self ._lock :
@@ -589,6 +590,7 @@ def _handle_join_group_response(self, future, send_time, response):
589
590
response .member_id ,
590
591
response .group_protocol )
591
592
593
+ log .info ("Successfully joined group %s %s" , self .group_id , self ._generation )
592
594
if response .leader_id == response .member_id :
593
595
log .info ("Elected group leader -- performing partition"
594
596
" assignments using %s" , self ._generation .protocol )
@@ -597,24 +599,24 @@ def _handle_join_group_response(self, future, send_time, response):
597
599
self ._on_join_follower ().chain (future )
598
600
599
601
elif error_type is Errors .CoordinatorLoadInProgressError :
600
- log .debug ("Attempt to join group %s rejected since coordinator %s"
601
- " is loading the group." , self .group_id , self .coordinator_id )
602
+ log .info ("Attempt to join group %s rejected since coordinator %s"
603
+ " is loading the group." , self .group_id , self .coordinator_id )
602
604
# backoff and retry
603
605
future .failure (error_type (response ))
604
606
elif error_type is Errors .UnknownMemberIdError :
605
607
# reset the member id and retry immediately
606
608
error = error_type (self ._generation .member_id )
607
609
self .reset_generation ()
608
- log .debug ("Attempt to join group %s failed due to unknown member id" ,
609
- self .group_id )
610
+ log .info ("Attempt to join group %s failed due to unknown member id" ,
611
+ self .group_id )
610
612
future .failure (error )
611
613
elif error_type in (Errors .CoordinatorNotAvailableError ,
612
614
Errors .NotCoordinatorError ):
613
615
# re-discover the coordinator and retry with backoff
614
616
self .coordinator_dead (error_type ())
615
- log .debug ("Attempt to join group %s failed due to obsolete "
616
- "coordinator information: %s" , self .group_id ,
617
- error_type .__name__ )
617
+ log .info ("Attempt to join group %s failed due to obsolete "
618
+ "coordinator information: %s" , self .group_id ,
619
+ error_type .__name__ )
618
620
future .failure (error_type ())
619
621
elif error_type in (Errors .InconsistentGroupProtocolError ,
620
622
Errors .InvalidSessionTimeoutError ,
@@ -625,12 +627,21 @@ def _handle_join_group_response(self, future, send_time, response):
625
627
self .group_id , error )
626
628
future .failure (error )
627
629
elif error_type is Errors .GroupAuthorizationFailedError :
630
+ log .error ("Attempt to join group %s failed due to group authorization error" ,
631
+ self .group_id )
628
632
future .failure (error_type (self .group_id ))
629
633
elif error_type is Errors .MemberIdRequiredError :
630
634
# Broker requires a concrete member id to be allowed to join the group. Update member id
631
635
# and send another join group request in next cycle.
636
+ log .info ("Received member id %s for group %s; will retry join-group" ,
637
+ response .member_id , self .group_id )
632
638
self .reset_generation (response .member_id )
633
639
future .failure (error_type ())
640
+ elif error_type is Errors .RebalanceInProgressError :
641
+ log .info ("Attempt to join group %s failed due to RebalanceInProgressError,"
642
+ " which could indicate a replication timeout on the broker. Will retry." ,
643
+ self .group_id )
644
+ future .failure (error_type ())
634
645
else :
635
646
# unexpected error, throw the exception
636
647
error = error_type ()
@@ -699,6 +710,7 @@ def _send_sync_group_request(self, request):
699
710
return future
700
711
701
712
def _handle_sync_group_response (self , future , send_time , response ):
713
+ log .debug ("Received SyncGroup response: %s" , response )
702
714
error_type = Errors .for_code (response .error_code )
703
715
if error_type is Errors .NoError :
704
716
if self ._sensors :
@@ -745,13 +757,13 @@ def _send_group_coordinator_request(self):
745
757
e = Errors .NodeNotReadyError (node_id )
746
758
return Future ().failure (e )
747
759
748
- log .debug ("Sending group coordinator request for group %s to broker %s" ,
749
- self .group_id , node_id )
750
760
version = self ._client .api_version (FindCoordinatorRequest , max_version = 2 )
751
761
if version == 0 :
752
762
request = FindCoordinatorRequest [version ](self .group_id )
753
763
else :
754
764
request = FindCoordinatorRequest [version ](self .group_id , 0 )
765
+ log .debug ("Sending group coordinator request for group %s to broker %s: %s" ,
766
+ self .group_id , node_id , request )
755
767
future = Future ()
756
768
_f = self ._client .send (node_id , request )
757
769
_f .add_callback (self ._handle_group_coordinator_response , future )
@@ -880,6 +892,7 @@ def maybe_leave_group(self, timeout_ms=None):
880
892
log .info ('Leaving consumer group (%s).' , self .group_id )
881
893
version = self ._client .api_version (LeaveGroupRequest , max_version = 2 )
882
894
request = LeaveGroupRequest [version ](self .group_id , self ._generation .member_id )
895
+ log .debug ('Sending LeaveGroupRequest to %s: %s' , self .coordinator_id , request )
883
896
future = self ._client .send (self .coordinator_id , request )
884
897
future .add_callback (self ._handle_leave_group_response )
885
898
future .add_errback (log .error , "LeaveGroup request failed: %s" )
@@ -888,10 +901,11 @@ def maybe_leave_group(self, timeout_ms=None):
888
901
self .reset_generation ()
889
902
890
903
def _handle_leave_group_response (self , response ):
904
+ log .debug ("Received LeaveGroupResponse: %s" , response )
891
905
error_type = Errors .for_code (response .error_code )
892
906
if error_type is Errors .NoError :
893
- log .debug ("LeaveGroup request for group %s returned successfully" ,
894
- self .group_id )
907
+ log .info ("LeaveGroup request for group %s returned successfully" ,
908
+ self .group_id )
895
909
else :
896
910
log .error ("LeaveGroup request for group %s failed with error: %s" ,
897
911
self .group_id , error_type ())
@@ -911,7 +925,7 @@ def _send_heartbeat_request(self):
911
925
request = HeartbeatRequest [version ](self .group_id ,
912
926
self ._generation .generation_id ,
913
927
self ._generation .member_id )
914
- heartbeat_log .debug ("Heartbeat: %s[%s] %s" , request . group , request . generation_id , request . member_id ) # pylint: disable-msg=no-member
928
+ heartbeat_log .debug ("Sending HeartbeatRequest to %s: %s" , self . coordinator_id , request )
915
929
future = Future ()
916
930
_f = self ._client .send (self .coordinator_id , request )
917
931
_f .add_callback (self ._handle_heartbeat_response , future , time .time ())
@@ -922,10 +936,10 @@ def _send_heartbeat_request(self):
922
936
def _handle_heartbeat_response (self , future , send_time , response ):
923
937
if self ._sensors :
924
938
self ._sensors .heartbeat_latency .record ((time .time () - send_time ) * 1000 )
939
+ heartbeat_log .debug ("Received heartbeat response for group %s: %s" ,
940
+ self .group_id , response )
925
941
error_type = Errors .for_code (response .error_code )
926
942
if error_type is Errors .NoError :
927
- heartbeat_log .debug ("Received successful heartbeat response for group %s" ,
928
- self .group_id )
929
943
future .success (None )
930
944
elif error_type in (Errors .CoordinatorNotAvailableError ,
931
945
Errors .NotCoordinatorError ):
@@ -1118,7 +1132,13 @@ def _run_once(self):
1118
1132
# the poll timeout has expired, which means that the
1119
1133
# foreground thread has stalled in between calls to
1120
1134
# poll(), so we explicitly leave the group.
1121
- heartbeat_log .warning ('Heartbeat poll expired, leaving group' )
1135
+ heartbeat_log .warning (
1136
+ "Consumer poll timeout has expired. This means the time between subsequent calls to poll()"
1137
+ " was longer than the configured max_poll_interval_ms, which typically implies that"
1138
+ " the poll loop is spending too much time processing messages. You can address this"
1139
+ " either by increasing max_poll_interval_ms or by reducing the maximum size of batches"
1140
+ " returned in poll() with max_poll_records."
1141
+ )
1122
1142
self .coordinator .maybe_leave_group ()
1123
1143
1124
1144
elif not self .coordinator .heartbeat .should_heartbeat ():
@@ -1128,10 +1148,12 @@ def _run_once(self):
1128
1148
self .coordinator ._lock .wait (next_hb )
1129
1149
1130
1150
else :
1151
+ heartbeat_log .debug ('Sending heartbeat for group %s %s' , self .coordinator .group_id , self .coordinator ._generation )
1131
1152
self .coordinator .heartbeat .sent_heartbeat ()
1132
1153
future = self .coordinator ._send_heartbeat_request ()
1133
1154
future .add_callback (self ._handle_heartbeat_success )
1134
1155
future .add_errback (self ._handle_heartbeat_failure )
1156
+
1135
1157
finally :
1136
1158
self .coordinator ._lock .release ()
1137
1159
try :
@@ -1142,6 +1164,7 @@ def _run_once(self):
1142
1164
1143
1165
def _handle_heartbeat_success (self , result ):
1144
1166
with self .coordinator ._lock :
1167
+ heartbeat_log .debug ('Heartbeat success' )
1145
1168
self .coordinator .heartbeat .received_heartbeat ()
1146
1169
1147
1170
def _handle_heartbeat_failure (self , exception ):
@@ -1152,8 +1175,10 @@ def _handle_heartbeat_failure(self, exception):
1152
1175
# member in the group for as long as the duration of the
1153
1176
# rebalance timeout. If we stop sending heartbeats, however,
1154
1177
# then the session timeout may expire before we can rejoin.
1178
+ heartbeat_log .debug ('Treating RebalanceInProgressError as successful heartbeat' )
1155
1179
self .coordinator .heartbeat .received_heartbeat ()
1156
1180
else :
1181
+ heartbeat_log .debug ('Heartbeat failure: %s' , exception )
1157
1182
self .coordinator .heartbeat .fail_heartbeat ()
1158
1183
# wake up the thread if it's sleeping to reschedule the heartbeat
1159
1184
self .coordinator ._lock .notify ()
0 commit comments