Skip to content

Commit 346c198

Browse files
committed
PHPLIB-1237 Implement Parallel Multi File Export Bench
1 parent 11b0213 commit 346c198

File tree

4 files changed

+297
-41
lines changed

4 files changed

+297
-41
lines changed
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<?php
2+
3+
namespace MongoDB\Benchmark\DriverBench\Amp;
4+
5+
use Amp\Cancellation;
6+
use Amp\Parallel\Worker\Task;
7+
use Amp\Sync\Channel;
8+
use MongoDB\Benchmark\DriverBench\ParallelMultiFileExportBench;
9+
10+
final class ExportFileTask implements Task
11+
{
12+
public function __construct(
13+
private string|array $files,
14+
private array $filter = [],
15+
private array $options = [],
16+
) {
17+
}
18+
19+
public function run(Channel $channel, Cancellation $cancellation): mixed
20+
{
21+
ParallelMultiFileExportBench::exportFile($this->files, $this->filter, $this->options);
22+
23+
return $this->files;
24+
}
25+
}

benchmark/src/DriverBench/Amp/ImportFileTask.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,14 @@
1010
final class ImportFileTask implements Task
1111
{
1212
public function __construct(
13-
private string $file,
13+
private array $files,
1414
) {
1515
}
1616

1717
public function run(Channel $channel, Cancellation $cancellation): mixed
1818
{
19-
ParallelMultiFileImportBench::importFile($this->file);
19+
ParallelMultiFileImportBench::importFile($this->files);
2020

21-
return null;
21+
return $this->files;
2222
}
2323
}
Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
<?php
2+
3+
namespace MongoDB\Benchmark\DriverBench;
4+
5+
use Amp\Future;
6+
use Amp\Parallel\Worker\ContextWorkerFactory;
7+
use Amp\Parallel\Worker\ContextWorkerPool;
8+
use Generator;
9+
use MongoDB\Benchmark\DriverBench\Amp\ExportFileTask;
10+
use MongoDB\Benchmark\Fixtures\Data;
11+
use MongoDB\Benchmark\Utils;
12+
use MongoDB\BSON\Document;
13+
use PhpBench\Attributes\AfterClassMethods;
14+
use PhpBench\Attributes\AfterMethods;
15+
use PhpBench\Attributes\BeforeClassMethods;
16+
use PhpBench\Attributes\Iterations;
17+
use PhpBench\Attributes\ParamProviders;
18+
use PhpBench\Attributes\Revs;
19+
use RuntimeException;
20+
21+
use function array_chunk;
22+
use function array_fill;
23+
use function array_map;
24+
use function ceil;
25+
use function file_exists;
26+
use function file_get_contents;
27+
use function file_put_contents;
28+
use function is_dir;
29+
use function json_encode;
30+
use function mkdir;
31+
use function pcntl_fork;
32+
use function pcntl_waitpid;
33+
use function range;
34+
use function sprintf;
35+
use function sys_get_temp_dir;
36+
use function unlink;
37+
38+
/**
39+
* For accurate results, run benchmarks on a standalone server.
40+
*
41+
* @see https://github.com/mongodb/specifications/blob/ddfc8b583d49aaf8c4c19fa01255afb66b36b92e/source/benchmarking/benchmarking.rst#ldjson-multi-file-export
42+
*/
43+
#[BeforeClassMethods('beforeClass')]
44+
#[AfterClassMethods('afterClass')]
45+
#[AfterMethods('afterIteration')]
46+
#[Iterations(1)]
47+
#[Revs(1)]
48+
final class ParallelMultiFileExportBench
49+
{
50+
public static function beforeClass(): void
51+
{
52+
// Resets the database to ensure that the collection is empty
53+
Utils::getDatabase()->drop();
54+
55+
$doc = Document::fromJSON(file_get_contents(Data::LDJSON_FILE_PATH));
56+
Utils::getCollection()->insertMany(array_fill(0, 500_000, $doc));
57+
}
58+
59+
public static function afterClass(): void
60+
{
61+
Utils::getDatabase()->drop();
62+
}
63+
64+
public function afterIteration(): void
65+
{
66+
foreach (self::getFileNames() as $file) {
67+
if (file_exists($file)) {
68+
unlink($file);
69+
}
70+
}
71+
}
72+
73+
/**
74+
* Using a single thread to export multiple files.
75+
* By executing a single Find command for multiple files, we can reduce the number of roundtrips to the server.
76+
*
77+
* @param array{chunk:int} $params
78+
*/
79+
#[ParamProviders(['provideChunkParams'])]
80+
public function benchSequential(array $params): void
81+
{
82+
foreach (array_chunk(self::getFileNames(), $params['chunk']) as $i => $files) {
83+
self::exportFile($files, [], [
84+
'limit' => 5_000 * $params['chunk'],
85+
'skip' => 5_000 * $params['chunk'] * $i,
86+
]);
87+
}
88+
}
89+
90+
/**
91+
* Using multiple forked threads
92+
*
93+
* @param array{chunk:int} $params
94+
*/
95+
#[ParamProviders(['provideChunkParams'])]
96+
public function benchFork(array $params): void
97+
{
98+
$pids = [];
99+
100+
// Reset to ensure that the existing libmongoc client (via the Manager) is not re-used by the child
101+
// process. When the child process constructs a new Manager, the differing PID will result in creation
102+
// of a new libmongoc client.
103+
Utils::reset();
104+
105+
// Create a child process for each chunk of files
106+
foreach (array_chunk(self::getFileNames(), $params['chunk']) as $i => $files) {
107+
$pid = pcntl_fork();
108+
if ($pid === 0) {
109+
self::exportFile($files, [], [
110+
'limit' => 5_000 * $params['chunk'],
111+
'skip' => 5_000 * $params['chunk'] * $i,
112+
]);
113+
114+
// Exit the child process
115+
exit(0);
116+
}
117+
118+
if ($pid === -1) {
119+
throw new RuntimeException('Failed to fork');
120+
}
121+
122+
// Keep the forked process id to wait for it later
123+
$pids[$pid] = true;
124+
}
125+
126+
// Wait for all child processes to finish
127+
while ($pids !== []) {
128+
$pid = pcntl_waitpid(-1, $status);
129+
unset($pids[$pid]);
130+
}
131+
}
132+
133+
/**
134+
* Using amphp/parallel with worker pool
135+
*
136+
* @param array{chunk:int} $params
137+
*/
138+
#[ParamProviders(['provideChunkParams'])]
139+
public function benchAmpWorkers(array $params): void
140+
{
141+
$workerPool = new ContextWorkerPool(ceil(100 / $params['chunk']), new ContextWorkerFactory());
142+
143+
$futures = [];
144+
foreach (array_chunk(self::getFileNames(), $params['chunk']) as $i => $files) {
145+
$futures[] = $workerPool->submit(
146+
new ExportFileTask(
147+
files: $files,
148+
options: [
149+
'limit' => 5_000 * $params['chunk'],
150+
'skip' => 5_000 * $params['chunk'] * $i,
151+
],
152+
),
153+
)->getFuture();
154+
}
155+
156+
foreach (Future::iterate($futures) as $future) {
157+
$future->await();
158+
}
159+
}
160+
161+
public static function provideChunkParams(): Generator
162+
{
163+
yield 'by 1' => ['chunk' => 1];
164+
yield 'by 2' => ['chunk' => 2];
165+
yield 'by 4' => ['chunk' => 4];
166+
yield 'by 8' => ['chunk' => 8];
167+
yield 'by 13' => ['chunk' => 13];
168+
yield 'by 20' => ['chunk' => 20];
169+
yield 'by 100' => ['chunk' => 100];
170+
}
171+
172+
/**
173+
* Export a query to a file
174+
*/
175+
public static function exportFile(array|string $files, array $filter = [], array $options = []): void
176+
{
177+
$options += [
178+
// bson typemap is faster on query result, but slower to JSON encode
179+
'typeMap' => ['root' => 'array'],
180+
// Excludes _id field to be identical to fixtures data
181+
'projection' => ['_id' => 0],
182+
'sort' => ['_id' => 1],
183+
];
184+
$cursor = Utils::getCollection()->find($filter, $options);
185+
$cursor->rewind();
186+
187+
foreach ((array) $files as $file) {
188+
// Aggregate file in memory to reduce filesystem operations
189+
$data = '';
190+
for ($i = 0; $i < 5_000; $i++) {
191+
$document = $cursor->current();
192+
// Cursor exhausted
193+
if (! $document) {
194+
break;
195+
}
196+
197+
// We don't use MongoDB\BSON\Document::toCanonicalExtendedJSON() because
198+
// it is slower than json_encode() on an array.
199+
$data .= json_encode($document) . "\n";
200+
$cursor->next();
201+
}
202+
203+
// Write file in a single operation
204+
file_put_contents($file, $data);
205+
}
206+
}
207+
208+
/**
209+
* Using a method to regenerate the file names because we cannot cache the result of the method in a static
210+
* property. The benchmark runner will call the method in a different process, so the static property will not be
211+
* populated.
212+
*/
213+
private static function getFileNames(): array
214+
{
215+
$tempDir = sys_get_temp_dir() . '/mongodb-php-benchmark';
216+
if (! is_dir($tempDir)) {
217+
mkdir($tempDir);
218+
}
219+
220+
return array_map(
221+
static fn (int $i) => sprintf('%s/%03d.txt', $tempDir, $i),
222+
range(0, 99),
223+
);
224+
}
225+
}

0 commit comments

Comments
 (0)