-
Notifications
You must be signed in to change notification settings - Fork 266
PHPLIB-1237 Implement Parallel Multi File Export Bench #1169
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
a59e030
Upgrade benchmark to AMPHP with fibers
GromNaN 58c96ba
PHPLIB-1237 Implement Parallel Multi File Export Bench
GromNaN 033f317
Fix BSONMicroBench::benchDecoding to take the already encoded bson-bi…
GromNaN 308f508
fclose in finally block
GromNaN File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
<?php | ||
|
||
namespace MongoDB\Benchmark\DriverBench\Amp; | ||
|
||
use Amp\Cancellation; | ||
use Amp\Parallel\Worker\Task; | ||
use Amp\Sync\Channel; | ||
use MongoDB\Benchmark\DriverBench\ParallelMultiFileExportBench; | ||
|
||
/**
 * Worker task that exports query results to one or more files.
 *
 * The actual work is delegated to ParallelMultiFileExportBench::exportFile().
 */
final class ExportFileTask implements Task
{
    /**
     * @param string|array $files   File path, or list of file paths, passed on to exportFile()
     * @param array        $filter  Query filter passed on to exportFile()
     * @param array        $options Query options passed on to exportFile()
     */
    public function __construct(
        private string|array $files,
        private array $filter = [],
        private array $options = [],
    ) {
    }

    /**
     * Runs the export, then returns the file name(s) this task was constructed with.
     *
     * Neither the channel nor the cancellation is used.
     */
    public function run(Channel $channel, Cancellation $cancellation): mixed
    {
        $files = $this->files;

        ParallelMultiFileExportBench::exportFile($files, $this->filter, $this->options);

        return $files;
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
<?php | ||
|
||
namespace MongoDB\Benchmark\DriverBench\Amp; | ||
|
||
use Amp\Cancellation; | ||
use Amp\Parallel\Worker\Task; | ||
use Amp\Sync\Channel; | ||
use MongoDB\Benchmark\DriverBench\ParallelMultiFileImportBench; | ||
|
||
/**
 * Worker task that imports a set of files.
 *
 * The actual work is delegated to ParallelMultiFileImportBench::importFile().
 */
final class ImportFileTask implements Task
{
    /** @param array $files Files passed on to importFile() */
    public function __construct(
        private array $files,
    ) {
    }

    /**
     * Runs the import, then returns the files this task was constructed with.
     *
     * Neither the channel nor the cancellation is used.
     */
    public function run(Channel $channel, Cancellation $cancellation): mixed
    {
        $files = $this->files;

        ParallelMultiFileImportBench::importFile($files);

        return $files;
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
225 changes: 225 additions & 0 deletions
225
benchmark/src/DriverBench/ParallelMultiFileExportBench.php
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,225 @@ | ||
<?php | ||
|
||
namespace MongoDB\Benchmark\DriverBench; | ||
|
||
use Amp\Future;
use Amp\Parallel\Worker\ContextWorkerFactory;
use Amp\Parallel\Worker\ContextWorkerPool;
use Generator;
use MongoDB\Benchmark\DriverBench\Amp\ExportFileTask;
use MongoDB\Benchmark\Fixtures\Data;
use MongoDB\Benchmark\Utils;
use MongoDB\BSON\Document;
use PhpBench\Attributes\AfterClassMethods;
use PhpBench\Attributes\AfterMethods;
use PhpBench\Attributes\BeforeClassMethods;
use PhpBench\Attributes\Iterations;
use PhpBench\Attributes\ParamProviders;
use PhpBench\Attributes\Revs;
use RuntimeException;
use Throwable;

use function array_chunk;
use function array_fill;
use function array_map;
use function ceil;
use function file_exists;
use function file_get_contents;
use function file_put_contents;
use function fwrite;
use function is_dir;
use function json_encode;
use function mkdir;
use function pcntl_fork;
use function pcntl_waitpid;
use function pcntl_wexitstatus;
use function pcntl_wifexited;
use function range;
use function sprintf;
use function sys_get_temp_dir;
use function unlink;

use const JSON_THROW_ON_ERROR;
use const STDERR;
|
||
/**
 * Exports the documents of a collection to multiple LDJSON files, comparing a sequential
 * implementation with forked processes and with amphp/parallel workers.
 *
 * For accurate results, run benchmarks on a standalone server.
 *
 * @see https://github.com/mongodb/specifications/blob/ddfc8b583d49aaf8c4c19fa01255afb66b36b92e/source/benchmarking/benchmarking.rst#ldjson-multi-file-export
 */
#[BeforeClassMethods('beforeClass')]
#[AfterClassMethods('afterClass')]
#[AfterMethods('afterIteration')]
#[Iterations(1)]
#[Revs(1)]
final class ParallelMultiFileExportBench
{
    /** Number of files written by each benchmark */
    private const FILE_COUNT = 100;

    /** Number of documents written to each file */
    private const DOCUMENTS_PER_FILE = 5_000;

    /**
     * Drops the database and fills the collection with copies of the fixture document,
     * enough to fill every exported file.
     *
     * @throws RuntimeException if the fixture file cannot be read
     */
    public static function beforeClass(): void
    {
        // Resets the database to ensure that the collection is empty
        Utils::getDatabase()->drop();

        $json = file_get_contents(Data::LDJSON_FILE_PATH);
        if ($json === false) {
            throw new RuntimeException(sprintf('Failed to read fixture file "%s"', Data::LDJSON_FILE_PATH));
        }

        $doc = Document::fromJSON($json);
        Utils::getCollection()->insertMany(
            array_fill(0, self::FILE_COUNT * self::DOCUMENTS_PER_FILE, $doc),
        );
    }

    /**
     * Drops the benchmark database.
     */
    public static function afterClass(): void
    {
        Utils::getDatabase()->drop();
    }

    /**
     * Removes the files written by the previous benchmark method.
     */
    public function afterIteration(): void
    {
        foreach (self::getFileNames() as $file) {
            if (file_exists($file)) {
                unlink($file);
            }
        }
    }

    /**
     * Using a single thread to export multiple files.
     * By executing a single Find command for multiple files, we can reduce the number of roundtrips to the server.
     *
     * @param array{chunk:int} $params
     */
    #[ParamProviders(['provideChunkParams'])]
    public function benchSequential(array $params): void
    {
        foreach (array_chunk(self::getFileNames(), $params['chunk']) as $i => $files) {
            self::exportFile($files, [], self::getChunkOptions($params['chunk'], $i));
        }
    }

    /**
     * Using multiple forked processes, one per chunk of files
     *
     * @param array{chunk:int} $params
     *
     * @throws RuntimeException if forking or waiting fails, or if any child process fails its export
     */
    #[ParamProviders(['provideChunkParams'])]
    public function benchFork(array $params): void
    {
        $pids = [];

        // Reset to ensure that the existing libmongoc client (via the Manager) is not re-used by the child
        // process. When the child process constructs a new Manager, the differing PID will result in creation
        // of a new libmongoc client.
        Utils::reset();

        // Create a child process for each chunk of files
        foreach (array_chunk(self::getFileNames(), $params['chunk']) as $i => $files) {
            $pid = pcntl_fork();
            if ($pid === 0) {
                // The child must always terminate here. If an exception escaped, the child would
                // unwind out of this method and keep executing the parent's code path.
                try {
                    self::exportFile($files, [], self::getChunkOptions($params['chunk'], $i));
                } catch (Throwable $e) {
                    fwrite(STDERR, sprintf("Export failed in child process: %s\n", $e->getMessage()));
                    exit(1);
                }

                // Exit the child process
                exit(0);
            }

            if ($pid === -1) {
                throw new RuntimeException('Failed to fork');
            }

            // Keep the forked process id to wait for it later
            $pids[$pid] = true;
        }

        // Wait for all child processes to finish
        $failures = 0;
        while ($pids !== []) {
            $pid = pcntl_waitpid(-1, $status);
            if ($pid === -1) {
                // Without this check the loop would never terminate, as $pids would not shrink
                throw new RuntimeException('Failed to wait for child processes');
            }

            if (! isset($pids[$pid])) {
                continue;
            }

            unset($pids[$pid]);

            if (! pcntl_wifexited($status) || pcntl_wexitstatus($status) !== 0) {
                $failures++;
            }
        }

        if ($failures > 0) {
            throw new RuntimeException(sprintf('%d child process(es) failed to export files', $failures));
        }
    }

    /**
     * Using amphp/parallel with worker pool
     *
     * @param array{chunk:int} $params
     */
    #[ParamProviders(['provideChunkParams'])]
    public function benchAmpWorkers(array $params): void
    {
        // One worker per chunk of files; ceil() returns a float, so cast it explicitly
        $workerPool = new ContextWorkerPool(
            (int) ceil(self::FILE_COUNT / $params['chunk']),
            new ContextWorkerFactory(),
        );

        $futures = [];
        foreach (array_chunk(self::getFileNames(), $params['chunk']) as $i => $files) {
            $futures[] = $workerPool->submit(
                new ExportFileTask(
                    files: $files,
                    options: self::getChunkOptions($params['chunk'], $i),
                ),
            )->getFuture();
        }

        foreach (Future::iterate($futures) as $future) {
            $future->await();
        }
    }

    /**
     * Provides the number of files exported per query / process / worker.
     */
    public static function provideChunkParams(): Generator
    {
        yield 'by 1' => ['chunk' => 1];
        yield 'by 2' => ['chunk' => 2];
        yield 'by 4' => ['chunk' => 4];
        yield 'by 8' => ['chunk' => 8];
        yield 'by 13' => ['chunk' => 13];
        yield 'by 20' => ['chunk' => 20];
        yield 'by 100' => ['chunk' => 100];
    }

    /**
     * Export a query to one or more files, writing up to 5,000 documents per file
     * in the order returned by the cursor.
     *
     * @param array|string $files   File path, or list of file paths, to write
     * @param array        $filter  Query filter
     * @param array        $options Query options; they take precedence over the defaults set here
     *
     * @throws RuntimeException if a file cannot be written
     */
    public static function exportFile(array|string $files, array $filter = [], array $options = []): void
    {
        $options += [
            // bson typemap is faster on query result, but slower to JSON encode
            'typeMap' => ['root' => 'array'],
            // Excludes _id field to be identical to fixtures data
            'projection' => ['_id' => 0],
            'sort' => ['_id' => 1],
        ];
        $cursor = Utils::getCollection()->find($filter, $options);
        $cursor->rewind();

        foreach ((array) $files as $file) {
            // Aggregate file in memory to reduce filesystem operations
            $data = '';

            // valid() detects cursor exhaustion; testing the truthiness of current() would also
            // stop on an empty document, which is an empty array with this type map.
            for ($i = 0; $i < self::DOCUMENTS_PER_FILE && $cursor->valid(); $i++) {
                // We don't use MongoDB\BSON\Document::toCanonicalExtendedJSON() because
                // it is slower than json_encode() on an array.
                $data .= json_encode($cursor->current(), JSON_THROW_ON_ERROR) . "\n";
                $cursor->next();
            }

            // Write file in a single operation
            if (file_put_contents($file, $data) === false) {
                throw new RuntimeException(sprintf('Failed to write file "%s"', $file));
            }
        }
    }

    /**
     * Computes the limit and skip options of the query that exports the chunk at the given index.
     *
     * @param int $chunk Number of files per chunk
     * @param int $index Zero-based index of the chunk
     *
     * @return array{limit:int, skip:int}
     */
    private static function getChunkOptions(int $chunk, int $index): array
    {
        $limit = self::DOCUMENTS_PER_FILE * $chunk;

        return [
            'limit' => $limit,
            'skip' => $limit * $index,
        ];
    }

    /**
     * Using a method to regenerate the file names because we cannot cache the result of the method in a static
     * property. The benchmark runner will call the method in a different process, so the static property will not be
     * populated.
     *
     * @return list<string>
     *
     * @throws RuntimeException if the temporary directory cannot be created
     */
    private static function getFileNames(): array
    {
        $tempDir = sys_get_temp_dir() . '/mongodb-php-benchmark';

        // The trailing is_dir() tolerates the directory being created by someone else in the meantime
        if (! is_dir($tempDir) && ! mkdir($tempDir) && ! is_dir($tempDir)) {
            throw new RuntimeException(sprintf('Failed to create directory "%s"', $tempDir));
        }

        return array_map(
            static fn (int $i): string => sprintf('%s/%03d.txt', $tempDir, $i),
            range(0, self::FILE_COUNT - 1),
        );
    }
}
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that `json_encode` does not produce the same result as `toCanonicalExtendedJSON`. That said, it does produce the result we wanted.
I also had to do a double-take on this, so I wrote a small benchmark:
The last benchmark actually calls `json_encode($document->toPHP(['root' => 'array']))`, so I'm a bit surprised that it is faster. This might provide an opportunity to revisit the JSON serialisation logic in libbson.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Tracked in PHPC-2299