Skip to content

Commit b321908

Browse files
committed
tools/content: Support systematically surveying unimplemented content features.
We added 2 scripts. - fetch_messages.dart, the script that fetches messages from a given Zulip server, that does not depend on Flutter or other involved Zulip Flutter packages, so that it can run without Flutter. It is meant to be run first to produce the corpuses needed for surveying the unimplemented features. The fetched messages are formatted in JSON Lines format, where each individual entry is JSON containing the message ID and the rendered HTML content. The script stores output in separate files for messages from each server, because message IDs are not unique across them. - unimplemented_features_test.dart, a test that goes over all messages collected, parses them with the content parser, and reports the unimplemented features it discovers. This is implemented as a test mainly because of its dependency on the content parser, which depends on the Flutter engine (and `flutter test` conveniently sets up a test device). The test can be run manually via: `flutter test --dart-define=corpusDir=path/to/corpusDir tools/content` See comments from the file for more instructions. Signed-off-by: Zixuan James Li <[email protected]>
1 parent 50e9d0a commit b321908

File tree

3 files changed

+400
-0
lines changed

3 files changed

+400
-0
lines changed

tools/content/fetch_messages.dart

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
#!/usr/bin/env dart
2+
3+
import 'dart:convert';
4+
import 'dart:io';
5+
import 'dart:math';
6+
7+
// Avoid any Flutter-related dependencies so this can be run as a CLI program.
8+
import 'package:args/args.dart';
9+
import 'package:http/http.dart';
10+
import 'package:ini/ini.dart' as ini;
11+
import 'package:zulip/api/backoff.dart';
12+
13+
import 'model.dart';
14+
15+
/// Fetch all public message contents from a Zulip server in bulk.
///
/// It outputs JSON entries of the message IDs and the rendered HTML contents in
/// JSON Lines (https://jsonlines.org) format. The output can be used later to
/// perform checks for discovering unimplemented features.
///
/// Because message IDs are only unique within a single server, the script
/// names corpuses from each server differently (if --corpus-dir is specified).
///
/// The script fetches in batches, using the newest (with --fetch-newer) or
/// oldest message ID seen so far as the anchor for the next request, and stops
/// once a request returns no messages.
///
/// See tools/content/unimplemented_features_test.dart for more details.
void main(List<String> args) async {
  final argParser = ArgParser();
  argParser.addOption(
    'config-file',
    help: 'A zuliprc file with identity information including email, API key\n'
      'and the Zulip server URL to fetch the messages from (required).\n\n'
      'To get the file, see\n'
      'https://zulip.com/api/configuring-python-bindings#download-a-zuliprc-file\n',
    valueHelp: 'path/to/zuliprc',
  );
  argParser.addOption(
    'corpus-dir',
    help: 'The directory to look for/store the corpus file. If not given,\n'
      'the script will write output to stdout. Otherwise, this will\n'
      'first read from the existing corpus file (assumed to be named\n'
      'as "host name of the Zulip server.jsonl") to avoid duplicates\n'
      'before fetching more messages',
    // The option names a directory, not an individual corpus file.
    valueHelp: 'path/to/corpus-dir',
  );
  argParser.addFlag(
    'fetch-newer',
    help: 'Fetch newer messages instead of older ones.\n'
      'Only useful when there is a matching corpus file in corpus-dir.',
    defaultsTo: false,
  );
  argParser.addFlag(
    'help', abbr: 'h',
    negatable: false,
    help: 'Show this help message.',
  );

  void printUsage() {
    // Give it a pass when printing the help message.
    // ignore: avoid_print
    print('usage: fetch_messages --config-file <CONFIG_FILE>\n\n'
      'Fetch message contents from a Zulip server in bulk.\n\n'
      '${argParser.usage}');
  }

  // Prints the usage text, then aborts by throwing with the given message.
  Never throwWithUsage(String error) {
    printUsage();
    throw Exception('\nError: $error');
  }

  final parsedArguments = argParser.parse(args);
  if (parsedArguments['help'] as bool) {
    printUsage();
    exit(0);
  }

  final zuliprc = parsedArguments['config-file'] as String?;
  if (zuliprc == null) {
    throwWithUsage('"config-file" is required');
  }

  final configFile = File(zuliprc);
  if (!configFile.existsSync()) {
    throwWithUsage('Config file "$zuliprc" does not exist');
  }

  // `zuliprc` is a file in INI format containing the user's identity
  // information.
  //
  // See also:
  //   https://zulip.com/api/configuring-python-bindings#configuration-keys-and-environment-variables
  final parsedConfig = ini.Config.fromString(configFile.readAsStringSync());
  final email = parsedConfig.get('api', 'email') as String;
  final apiKey = parsedConfig.get('api', 'key') as String;
  final site = Uri.parse(parsedConfig.get('api', 'site') as String);

  final outputDirStr = parsedArguments['corpus-dir'] as String?;
  final fetchNewer = parsedArguments['fetch-newer'] as bool;
  int? anchorMessageId;
  IOSink output = stdout;
  if (outputDirStr != null) {
    // If the corpus file already exists, we need to find the known
    // newest/oldest message so that we can continue fetching from where we
    // left off.
    final outputDir = Directory(outputDirStr);
    outputDir.createSync(recursive: true);
    final outputFile = File('$outputDirStr/${site.host}.jsonl');
    if (!outputFile.existsSync()) outputFile.createSync();
    await for (final message in readMessagesFromJsonl(outputFile)) {
      // Newer Zulip messages have higher message IDs. This utilizes that to
      // find the newest/oldest message.
      anchorMessageId ??= message.id;
      anchorMessageId = (fetchNewer ? max : min)(message.id, anchorMessageId);
    }
    output = outputFile.openWrite(mode: FileMode.writeOnlyAppend);
  }

  final client = Client();
  final authHeader = 'Basic ${base64Encode(utf8.encode('$email:$apiKey'))}';

  // These are working constants chosen arbitrarily.
  const batchSize = 5000;
  const maxRetries = 10;
  const fetchInterval = Duration(seconds: 5);

  // Number of consecutive failed requests; reset after each successful batch.
  int retries = 0;
  BackoffMachine? backoff;

  while (true) {
    // This loops until there is no message fetched in an iteration.
    final _GetMessagesResult result;
    try {
      result = await _getMessages(client, realmUrl: site,
        authHeader: authHeader,
        anchorMessageId: anchorMessageId,
        numBefore: fetchNewer ? 0 : batchSize,
        numAfter: fetchNewer ? batchSize : 0,
      );
    } catch (e) {
      // We could have more fine-grained error handling and avoid retrying on
      // non-network-related failures, but that's skipped for now.
      if (retries >= maxRetries) {
        rethrow;
      }
      retries++;
      await (backoff ??= BackoffMachine()).wait();
      continue;
    }

    // Materialize the entries once; a lazy `map` would re-parse every message
    // on each of the traversals below.
    final messageEntries = result.messages.map(MessageEntry.fromJson).toList();
    if (messageEntries.isEmpty) {
      // Sanity check to ensure that the server agrees
      // there are no more messages to fetch.
      if (fetchNewer) assert(result.foundNewest);
      if (!fetchNewer) assert(result.foundOldest);
      break;
    }

    // Find and use the newest/oldest message as the next message fetch anchor.
    anchorMessageId = messageEntries.map((x) => x.id).reduce(fetchNewer ? max : min);
    for (final entry in messageEntries) {
      output.writeln(jsonEncode(entry));
    }

    // This I/O operation could fail, but crashing is fine here.
    final flushFuture = output.flush();
    // Make sure the delay happens concurrently to the flush.
    await Future<void>.delayed(fetchInterval);
    await flushFuture;

    // The batch succeeded, so start the next failure streak from scratch:
    // both the backoff delay and the retry budget apply to consecutive
    // failures rather than to the whole run.
    backoff = null;
    retries = 0;
  }
  exit(0);
}
170+
171+
/// The parts of a get-messages response that this tool reads.
///
/// See https://zulip.com/api/get-messages#response
// Ported from [GetMessagesResult] to avoid depending on Flutter libraries.
class _GetMessagesResult {
  const _GetMessagesResult(this.foundOldest, this.foundNewest, this.messages);

  /// Builds a result from the decoded JSON object of a get-messages response.
  ///
  /// Reads the `found_oldest`, `found_newest` and `messages` keys; a value of
  /// an unexpected type makes the corresponding cast throw.
  factory _GetMessagesResult.fromJson(Map<String, Object?> json) {
    final oldestReached = json['found_oldest'] as bool;
    final newestReached = json['found_newest'] as bool;
    final rawMessages = json['messages'] as List<Object?>;
    return _GetMessagesResult(
      oldestReached,
      newestReached,
      [for (final raw in rawMessages) raw as Map<String, Object?>],
    );
  }

  /// The response's `found_oldest` value.
  final bool foundOldest;

  /// The response's `found_newest` value.
  final bool foundNewest;

  /// The response's `messages` list, one JSON object per message.
  final List<Map<String, Object?>> messages;
}
186+
187+
/// Fetches one batch of messages from the `/api/v1/messages` endpoint of
/// [realmUrl], narrowed to public channels.
///
/// When [anchorMessageId] is null, the request is anchored at `newest` and
/// includes the anchor; otherwise it is anchored at that message ID and the
/// anchor itself is excluded. [numBefore] and [numAfter] are passed through
/// as `num_before` and `num_after`.
///
/// Throws an [Exception] carrying the HTTP status code when the status is not
/// 200 or when the response body is not a JSON object.
///
/// See https://zulip.com/api/get-messages
Future<_GetMessagesResult> _getMessages(Client client, {
  required Uri realmUrl,
  required String authHeader,
  required int numBefore,
  required int numAfter,
  int? anchorMessageId,
}) async {
  final url = realmUrl.replace(
    path: '/api/v1/messages',
    queryParameters: {
      // This fallback will only be used when first fetching from a server.
      'anchor': anchorMessageId != null ? jsonEncode(anchorMessageId) : 'newest',
      // The anchor message already exists in the corpus,
      // so avoid fetching it again.
      'include_anchor': jsonEncode(anchorMessageId == null),
      'num_before': jsonEncode(numBefore),
      'num_after': jsonEncode(numAfter),
      'narrow': jsonEncode([{'operator': 'channels', 'operand': 'public'}]),
    });
  final response = await client.send(
    Request('GET', url)..headers['Authorization'] = authHeader);
  final bytes = await response.stream.toBytes();

  // Decode defensively: an error response may carry a body that is not valid
  // UTF-8 or JSON. Letting the decoder throw here would hide the status code,
  // so treat an undecodable body as "no details" instead.
  Object? decoded;
  try {
    decoded = jsonDecode(utf8.decode(bytes));
  } on FormatException {
    decoded = null;
  }

  if (response.statusCode != 200 || decoded is! Map<String, dynamic>) {
    // Just crashing early here should be fine for this tool. We don't need
    // to handle the specific error codes.
    throw Exception('Failed to get messages. Code: ${response.statusCode}\n'
      'Details: ${decoded ?? 'unknown'}');
  }
  return _GetMessagesResult.fromJson(decoded);
}

tools/content/model.dart

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import 'dart:io';
2+
import 'dart:convert';
3+
4+
import 'package:json_annotation/json_annotation.dart';
5+
6+
/// A message, reduced to its ID and rendered content.
@JsonSerializable()
final class MessageEntry {
  const MessageEntry({required this.id, required this.content});

  /// Builds an entry from one message object of a get-messages response,
  /// reading only its `id` and `content` keys.
  ///
  /// See also: https://zulip.com/api/get-messages#response
  factory MessageEntry.fromJson(Map<String, Object?> json) {
    final messageId = (json['id'] as num).toInt();
    final renderedContent = json['content'] as String;
    return MessageEntry(id: messageId, content: renderedContent);
  }

  /// The message ID, unique within a server.
  final int id;

  /// The rendered HTML of the message.
  final String content;

  /// Converts this entry to a map with `id` and `content` keys.
  Map<String, Object> toJson() {
    return {'id': id, 'content': content};
  }
}
28+
29+
/// Open the given JSON Lines file and read [MessageEntry]'s from it.
///
/// We store the entries in JSON Lines format and return them from a stream to
/// avoid excessive use of memory.
///
/// Lines that are empty or contain only whitespace are skipped, so a stray
/// blank line in the file does not abort the read. Any other line that is not
/// a JSON object with the keys [MessageEntry.fromJson] expects surfaces as an
/// error on the returned stream.
Stream<MessageEntry> readMessagesFromJsonl(File file) => file.openRead()
  .transform(utf8.decoder)
  .transform(const LineSplitter())
  .where((line) => line.trim().isNotEmpty)
  .map((line) => MessageEntry.fromJson(jsonDecode(line) as Map<String, Object?>));

0 commit comments

Comments
 (0)