Skip to content

Commit 2595cb0

Browse files
committed
store: Handle unexpected event-poll errors by reloading store, with backoff
This fixes #563. We'll follow up with a few more commits that give more-informative errors in some cases, or change the handling of others from retrying getEvents to reloading the data from scratch. The if-disposed and store.isLoading lines have no effect in the case of a BAD_EVENT_QUEUE_ID error, because those can only come from the getEvents request, and then the inner catch block will have already taken the same steps. Fixes: #563
1 parent 61001ca commit 2595cb0

File tree

3 files changed

+99
-8
lines changed

3 files changed

+99
-8
lines changed

lib/model/store.dart

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -993,6 +993,18 @@ class UpdateMachine {
993993
}());
994994
}
995995

996+
// This is static so that it persists through new UpdateMachine instances
997+
// as we attempt to fix things by reloading data from scratch. In principle
998+
// it could also be per-account (or per-realm or per-server); but currently
999+
// we skip that complication, as well as attempting to reset backoff on
1000+
// later success. After all, these unexpected errors should be uncommon;
1001+
// ideally they'd never happen.
1002+
static BackoffMachine get _unexpectedErrorBackoffMachine {
1003+
return __unexpectedErrorBackoffMachine
1004+
??= BackoffMachine(maxBound: const Duration(seconds: 60));
1005+
}
1006+
static BackoffMachine? __unexpectedErrorBackoffMachine;
1007+
9961008
/// This controls when we start to report transient errors to the user when
9971009
/// polling.
9981010
///
@@ -1113,13 +1125,42 @@ class UpdateMachine {
11131125
}
11141126
}
11151127
} catch (e) {
1128+
if (_disposed) return;
1129+
1130+
// An error occurred, other than the request errors we retry on.
1131+
// This means either a lost/expired event queue on the server (which is
1132+
// normal after the app is offline for a period like 10 minutes),
1133+
// or an unexpected exception representing a bug in our code or the server.
1134+
// Either way, the show must go on. So reload server data from scratch.
1135+
1136+
// First, log what happened.
1137+
store.isLoading = true;
1138+
bool isUnexpected;
11161139
switch (e) {
11171140
case ZulipApiException(code: 'BAD_EVENT_QUEUE_ID'):
11181141
assert(debugLog('Lost event queue for $store. Replacing…'));
11191142
// The old event queue is gone, so we need a new one. This is normal.
1143+
isUnexpected = false;
11201144

11211145
default:
1122-
rethrow; // TODO(#563) try to recover instead
1146+
assert(debugLog('BUG: Unexpected error in event polling: $e\n' // TODO(log)
1147+
'Replacing event queue…'));
1148+
_reportToUserErrorConnectingToServer(e);
1149+
// We can't just continue with the next event, because our state
1150+
// may be garbled due to failing to apply this one (and worse,
1151+
// any invariants that were left in a broken state from where
1152+
// the exception was thrown). So reload from scratch.
1153+
// Hopefully (probably?) the bug only affects our implementation of
1154+
// the *change* in state represented by the event, and when we get the
1155+
// new state in a fresh InitialSnapshot we'll handle that just fine.
1156+
isUnexpected = true;
1157+
}
1158+
1159+
if (isUnexpected) {
1160+
// We don't know the cause of the failure; it might well keep happening.
1161+
// Avoid creating a retry storm.
1162+
await _unexpectedErrorBackoffMachine.wait();
1163+
if (_disposed) return;
11231164
}
11241165

11251166
// This disposes the store, which disposes this update machine.

test/example_data.dart

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ void _checkPositive(int? value, String description) {
1818
assert(value == null || value > 0, '$description should be positive');
1919
}
2020

21+
Object nullCheckError() {
22+
try { null!; } catch (e) { return e; } // ignore: null_check_always_fails
23+
}
24+
2125
////////////////////////////////////////////////////////////////
2226
// Realm-wide (or server-wide) metadata.
2327
//

test/model/store_test.dart

Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -639,16 +639,25 @@ void main() {
639639
check(store.userSettings!.twentyFourHourTime).isTrue();
640640
}));
641641

642-
void checkReload(void Function() prepareError) {
642+
void checkReload(FutureOr<void> Function() prepareError, {
643+
bool expectBackoff = true,
644+
}) {
643645
awaitFakeAsync((async) async {
644646
await preparePoll();
645647
check(globalStore.perAccountSync(store.accountId)).identicalTo(store);
646648

647-
prepareError();
649+
await prepareError();
648650
updateMachine.debugAdvanceLoop();
649651
async.elapse(Duration.zero);
650652
check(store).isLoading.isTrue();
651653

654+
if (expectBackoff) {
655+
// The reload doesn't happen immediately; there's a timer.
656+
check(globalStore.perAccountSync(store.accountId)).identicalTo(store);
657+
check(async.pendingTimers).length.equals(1);
658+
async.flushTimers();
659+
}
660+
652661
// The global store has a new store.
653662
check(globalStore.perAccountSync(store.accountId)).not((it) => it.identicalTo(store));
654663
updateFromGlobalStore();
@@ -700,6 +709,10 @@ void main() {
700709

701710
// These cases are ordered by how far the request got before it failed.
702711

712+
void prepareUnexpectedLoopError() {
713+
updateMachine.debugPrepareLoopError(eg.nullCheckError());
714+
}
715+
703716
void prepareNetworkExceptionSocketException() {
704717
connection.prepare(exception: const SocketException('failed'));
705718
}
@@ -753,6 +766,23 @@ void main() {
753766
});
754767
}
755768

769+
Future<void> prepareHandleEventError() async {
770+
final stream = eg.stream();
771+
await store.addStream(stream);
772+
// Set up a situation that breaks our data structures' invariants:
773+
// a stream/channel found in the by-ID map is missing in the by-name map.
774+
store.streamsByName.remove(stream.name);
775+
// Then prepare an event on which handleEvent will throw
776+
// because it hits that broken invariant.
777+
connection.prepare(json: GetEventsResult(events: [
778+
ChannelDeleteEvent(id: 1, streams: [stream]),
779+
], queueId: null).toJson());
780+
}
781+
782+
test('reloads on unexpected error within loop', () {
783+
checkReload(prepareUnexpectedLoopError);
784+
});
785+
756786
test('retries on NetworkException from SocketException', () {
757787
// We skip reporting errors on these; check we retry them all the same.
758788
checkRetry(prepareNetworkExceptionSocketException);
@@ -786,8 +816,12 @@ void main() {
786816
checkRetry(prepareZulipApiExceptionBadRequest);
787817
});
788818

789-
test('reloads on expired queue', () {
790-
checkReload(prepareExpiredEventQueue);
819+
test('reloads immediately on expired queue', () {
820+
checkReload(expectBackoff: false, prepareExpiredEventQueue);
821+
});
822+
823+
test('reloads on handleEvent error', () {
824+
checkReload(prepareHandleEventError);
791825
});
792826

793827
test('expired queue disposes registered MessageListView instances', () => awaitFakeAsync((async) async {
@@ -828,18 +862,20 @@ void main() {
828862
await preparePoll(lastEventId: 1);
829863
}
830864

831-
void pollAndFail(FakeAsync async) {
865+
void pollAndFail(FakeAsync async, {bool shouldCheckRequest = true}) {
832866
updateMachine.debugAdvanceLoop();
833867
async.elapse(Duration.zero);
834-
checkLastRequest(lastEventId: 1);
868+
if (shouldCheckRequest) checkLastRequest(lastEventId: 1);
835869
check(store).isLoading.isTrue();
836870
}
837871

838872
Subject<String> checkReported(void Function() prepareError) {
839873
return awaitFakeAsync((async) async {
840874
await prepare();
841875
prepareError();
842-
pollAndFail(async);
876+
// No need to check on the request; there's no later step of this test
877+
// for it to be needed as setup for.
878+
pollAndFail(async, shouldCheckRequest: false);
843879
return check(takeLastReportedError()).isNotNull();
844880
});
845881
}
@@ -892,6 +928,10 @@ void main() {
892928
});
893929
}
894930

931+
test('report unexpected error within loop', () {
932+
checkReported(prepareUnexpectedLoopError);
933+
});
934+
895935
test('ignore NetworkException from SocketException', () {
896936
checkNotReported(prepareNetworkExceptionSocketException);
897937
});
@@ -941,6 +981,12 @@ void main() {
941981
test('ignore expired queue', () {
942982
checkNotReported(prepareExpiredEventQueue);
943983
});
984+
985+
test('report handleEvent error', () {
986+
checkReported(prepareHandleEventError).startsWith(
987+
"Error connecting to Zulip. Retrying…\n"
988+
"Error connecting to Zulip at");
989+
});
944990
});
945991
});
946992

0 commit comments

Comments
 (0)