Skip to content

Commit b8fe0d6

Browse files
committed
Add multithreaded benchmark for umf
Helper functions taken from pmemstream repo.
1 parent 4c95ea8 commit b8fe0d6

File tree

5 files changed

+319
-4
lines changed

5 files changed

+319
-4
lines changed

CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ option(UMF_BUILD_LIBUMF_POOL_JEMALLOC "Build the libumf_pool_jemalloc static lib
1313
option(UMF_BUILD_LIBUMF_POOL_SCALABLE "Build the libumf_pool_scalable static library" OFF)
1414
option(UMF_BUILD_TESTS "Build UMF tests" ON)
1515
option(UMF_BUILD_BENCHMARKS "Build UMF benchmarks" OFF)
16+
option(UMF_BUILD_BENCHMARKS_MT "Build UMF multithreaded benchmarks" OFF)
1617
option(UMF_ENABLE_POOL_TRACKING "Build UMF with pool tracking" ON)
1718
option(UMF_DEVELOPER_MODE "Enable developer checks, treats warnings as errors" OFF)
1819
option(UMF_FORMAT_CODE_STYLE "Format UMF code with clang-format" OFF)
@@ -26,7 +27,8 @@ option(USE_MSAN "Enable MemorySanitizer checks" OFF)
2627
# CMake will set up a strict C build, without C++ support.
2728
set(OPTIONS_REQUIRING_CXX
2829
"UMF_BUILD_TESTS"
29-
"UMF_BUILD_LIBUMF_POOL_DISJOINT")
30+
"UMF_BUILD_LIBUMF_POOL_DISJOINT"
31+
"UMF_BUILD_BENCHMARKS_MT")
3032
foreach(option_name ${OPTIONS_REQUIRING_CXX})
3133
if(${option_name})
3234
enable_language(CXX)

README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,14 +78,17 @@ Required packages:
7878
For development and contributions:
7979
- clang-format-15.0 (can be installed with `python -m pip install clang-format==15.0.7`)
8080

81-
For building tests and Disjoint Pool:
81+
For building tests, multithreaded benchmarks and Disjoint Pool:
8282
- C++ compiler with C++17 support
8383

8484
### Benchmark
8585

86-
A simple micro benchmark based on [ubench](https://github.com/sheredom/ubench.h).
86+
UMF comes with a single-threaded micro benchmark based on [ubench](https://github.com/sheredom/ubench.h).
8787
In order to build the benchmark, the `UMF_BUILD_BENCHMARKS` CMake configuration flag has to be turned `ON`.
8888

89+
UMF also provides multithreaded benchmarks that can be enabled by setting the `UMF_BUILD_BENCHMARKS_MT` CMake
90+
configuration flag to `ON`. Multithreaded benchmarks require C++ support.
91+
8992
### Windows
9093

9194
Generating Visual Studio Project. EXE and binaries will be in **build/bin/{build_config}**

benchmark/CMakeLists.txt

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
44

55
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
6-
message(WARNING "The ubench SHOULD NOT be run in the Debug build type!")
6+
message(WARNING "The benchmarks SHOULD NOT be run in the Debug build type!")
77
endif()
88

99
if(UMF_BUILD_LIBUMF_POOL_DISJOINT)
@@ -32,18 +32,44 @@ target_link_libraries(ubench
3232
pthread
3333
m)
3434

35+
if (UMF_BUILD_BENCHMARKS_MT)
36+
add_executable(multithread_bench multithread.cpp)
37+
target_link_libraries(multithread_bench
38+
umf
39+
${LIBS_OPTIONAL}
40+
pthread
41+
m)
42+
target_include_directories(multithread_bench PRIVATE ${UMF_CMAKE_SOURCE_DIR}/include/)
43+
endif()
44+
3545
if (UMF_BUILD_OS_MEMORY_PROVIDER)
3646
target_compile_definitions(ubench PRIVATE UMF_BUILD_OS_MEMORY_PROVIDER=1)
47+
48+
if (UMF_BUILD_BENCHMARKS_MT)
49+
target_compile_definitions(multithread_bench PRIVATE UMF_BUILD_OS_MEMORY_PROVIDER=1)
50+
endif()
3751
endif()
3852

3953
if (UMF_BUILD_LIBUMF_POOL_DISJOINT)
4054
target_compile_definitions(ubench PRIVATE UMF_BUILD_LIBUMF_POOL_DISJOINT=1)
55+
56+
if (UMF_BUILD_BENCHMARKS_MT)
57+
target_compile_definitions(multithread_bench PRIVATE UMF_BUILD_LIBUMF_POOL_DISJOINT=1)
58+
endif()
4159
endif()
4260

4361
if (UMF_BUILD_LIBUMF_POOL_JEMALLOC)
4462
target_compile_definitions(ubench PRIVATE UMF_BUILD_LIBUMF_POOL_JEMALLOC=1)
63+
64+
if (UMF_BUILD_BENCHMARKS_MT)
65+
target_compile_definitions(multithread_bench PRIVATE UMF_BUILD_LIBUMF_POOL_JEMALLOC=1)
66+
endif()
4567
endif()
4668

4769
if (UMF_BUILD_LIBUMF_POOL_SCALABLE)
4870
target_compile_definitions(ubench PRIVATE UMF_BUILD_LIBUMF_POOL_SCALABLE=1)
71+
72+
if (UMF_BUILD_BENCHMARKS_MT)
73+
target_compile_definitions(multithread_bench PRIVATE UMF_BUILD_LIBUMF_POOL_SCALABLE=1)
74+
endif()
4975
endif()

benchmark/multithread.cpp

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
/*
2+
*
3+
* Copyright (C) 2024 Intel Corporation
4+
*
5+
* Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT.
6+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
*
8+
*/
9+
10+
#include "multithread.hpp"
11+
12+
#include <umf/memory_pool.h>
13+
#include <umf/pools/pool_disjoint.h>
14+
#include <umf/pools/pool_jemalloc.h>
15+
#include <umf/pools/pool_scalable.h>
16+
#include <umf/providers/provider_os_memory.h>
17+
18+
#include <iostream>
19+
#include <memory>
20+
#include <numeric>
21+
22+
// Benchmark configuration: number of measured repetitions, allocations per
// thread, worker-thread count and the size of each allocation.
struct bench_params {
    size_t n_repeats = 5;
    size_t n_iterations = 50000;
    size_t n_threads = 20;
    size_t alloc_size = 64;
};
29+
30+
using poolCreateExtParams = std::tuple<umf_memory_pool_ops_t *, void *,
31+
umf_memory_provider_ops_t *, void *>;
32+
33+
static auto poolCreateExtUnique(poolCreateExtParams params) {
34+
umf_memory_pool_handle_t hPool;
35+
auto [pool_ops, pool_params, provider_ops, provider_params] = params;
36+
37+
umf_memory_provider_handle_t provider = nullptr;
38+
auto ret =
39+
umfMemoryProviderCreate(provider_ops, provider_params, &provider);
40+
if (ret != UMF_RESULT_SUCCESS) {
41+
std::cerr << "provider create failed" << std::endl;
42+
abort();
43+
}
44+
45+
ret = umfPoolCreate(pool_ops, provider, pool_params,
46+
UMF_POOL_CREATE_FLAG_OWN_PROVIDER, &hPool);
47+
if (ret != UMF_RESULT_SUCCESS) {
48+
std::cerr << "pool create failed" << std::endl;
49+
abort();
50+
}
51+
52+
return std::shared_ptr<umf_memory_pool_t>(hPool, &umfPoolDestroy);
53+
}
54+
55+
// Runs the multithreaded alloc/free workload on a pool created from `params`
// and prints the mean and standard deviation of the per-thread iteration
// times, plus the total number of failed allocations.
static void mt_alloc_free(poolCreateExtParams params,
                          const bench_params &bench = bench_params()) {
    auto pool = poolCreateExtUnique(params);

    // Per-thread scratch space, preallocated up front so that vector growth
    // does not pollute the measured section.
    std::vector<std::vector<void *>> allocs(bench.n_threads);
    std::vector<size_t> numFailures(bench.n_threads);
    for (auto &v : allocs) {
        v.reserve(bench.n_iterations);
    }

    auto values = umf_bench::measure<std::chrono::milliseconds>(
        bench.n_repeats, bench.n_threads,
        [&, pool = pool.get()](auto thread_id) {
            // size_t loop index matches bench.n_iterations (was `int`, a
            // signed/unsigned comparison).
            for (size_t i = 0; i < bench.n_iterations; i++) {
                allocs[thread_id].push_back(
                    umfPoolMalloc(pool, bench.alloc_size));
                if (!allocs[thread_id].back()) {
                    numFailures[thread_id]++;
                }
            }

            for (size_t i = 0; i < bench.n_iterations; i++) {
                umfPoolFree(pool, allocs[thread_id][i]);
            }

            // clear the vector as this function might be called multiple times
            allocs[thread_id].clear();
        });

    std::cout << "mean: " << umf_bench::mean(values)
              << " [ms] std_dev: " << umf_bench::std_dev(values) << " [ms]"
              << " (total alloc failures: "
              // size_t initial value: accumulating size_t counters into an
              // int (the old `0`) could truncate/overflow.
              << std::accumulate(numFailures.begin(), numFailures.end(),
                                 size_t{0})
              << " out of "
              << bench.n_iterations * bench.n_repeats * bench.n_threads << ")"
              << std::endl;
}
92+
93+
// Runs mt_alloc_free for every pool type that was enabled at configure time;
// pools that were not built are reported as skipped.
int main() {
#if defined(UMF_BUILD_OS_MEMORY_PROVIDER)
    // [[maybe_unused]]: only referenced when at least one pool is also
    // enabled; avoids -Wunused-variable in OS-provider-only builds.
    [[maybe_unused]] auto osParams = umfOsMemoryProviderParamsDefault();
#endif

#if defined(UMF_BUILD_OS_MEMORY_PROVIDER) &&                                   \
    defined(UMF_BUILD_LIBUMF_POOL_SCALABLE)

    // Increase iterations for scalable pool since it runs much faster than the remaining
    // ones.
    bench_params params;
    params.n_iterations *= 20;

    std::cout << "scalable_pool mt_alloc_free: ";
    mt_alloc_free(poolCreateExtParams{umfScalablePoolOps(), nullptr,
                                      umfOsMemoryProviderOps(), &osParams},
                  params);
#else
    std::cout << "skipping scalable_pool mt_alloc_free" << std::endl;
#endif

#if defined(UMF_BUILD_OS_MEMORY_PROVIDER) &&                                   \
    defined(UMF_BUILD_LIBUMF_POOL_JEMALLOC)
    std::cout << "jemalloc_pool mt_alloc_free: ";
    mt_alloc_free(poolCreateExtParams{umfJemallocPoolOps(), nullptr,
                                      umfOsMemoryProviderOps(), &osParams});
#else
    std::cout << "skipping jemalloc_pool mt_alloc_free" << std::endl;
#endif

#if defined(UMF_BUILD_OS_MEMORY_PROVIDER) &&                                   \
    defined(UMF_BUILD_LIBUMF_POOL_DISJOINT)
    auto disjointParams = umfDisjointPoolParamsDefault();

    std::cout << "disjoint_pool mt_alloc_free: ";
    mt_alloc_free(poolCreateExtParams{umfDisjointPoolOps(), &disjointParams,
                                      umfOsMemoryProviderOps(), &osParams});
#else
    std::cout << "skipping disjoint_pool mt_alloc_free" << std::endl;
#endif

    return 0;
}

benchmark/multithread.hpp

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
/*
2+
*
3+
* Copyright (C) 2024 Intel Corporation
4+
*
5+
* Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT.
6+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
*
8+
*/
9+
10+
#include <algorithm>
11+
#include <chrono>
12+
#include <cmath>
13+
#include <condition_variable>
14+
#include <functional>
15+
#include <mutex>
16+
#include <numeric>
17+
#include <thread>
18+
#include <vector>
19+
20+
namespace umf_bench {
21+
22+
/* Spawns threads_number threads, each executing f(thread_id), and joins all
 * of them before returning. */
template <typename Function>
void parallel_exec(size_t threads_number, Function &&f) {
    std::vector<std::thread> workers;
    workers.reserve(threads_number);

    for (size_t tid = 0; tid < threads_number; ++tid) {
        workers.emplace_back([&f, tid] { f(tid); });
    }

    for (auto &worker : workers) {
        worker.join();
    }
}
35+
36+
/* Single-use countdown latch: `desired` threads call wait(); all block until
 * the last one arrives. wait() must be called with `lock` held on the mutex
 * that guards this latch (the condition variable releases it while waiting).
 * NOTE(review): calling wait() more than `desired` times underflows the
 * size_t counter — the latch is single-use by design (see
 * syncthreads_barrier, which swaps in a fresh latch after each round). */
class latch {
  public:
    // desired - number of threads that must arrive before any are released.
    latch(size_t desired) : counter(desired) {}

    /* Returns true for the last thread arriving at the latch, false for all
     * other threads. */
    bool wait(std::unique_lock<std::mutex> &lock) {
        counter--;
        if (counter > 0) {
            // Predicate re-checked on wakeup, so spurious wakeups are safe.
            cv.wait(lock, [&] { return counter == 0; });
            return false;
        } else {
            /*
             * notify_call could be called outside of a lock
             * (it would perform better) but drd complains
             * in that case
             */
            cv.notify_all();
            return true;
        }
    }

  private:
    std::condition_variable cv;
    size_t counter = 0;
};
62+
63+
/* Implements multi-use barrier (latch). Once all threads arrive at the
64+
* latch, a new latch is allocated and used by all subsequent calls to
65+
* syncthreads. */
66+
struct syncthreads_barrier {
67+
syncthreads_barrier(size_t num_threads) : num_threads(num_threads) {
68+
mutex = std::shared_ptr<std::mutex>(new std::mutex);
69+
current_latch = std::shared_ptr<latch>(new latch(num_threads));
70+
}
71+
72+
syncthreads_barrier(const syncthreads_barrier &) = delete;
73+
syncthreads_barrier &operator=(const syncthreads_barrier &) = delete;
74+
syncthreads_barrier(syncthreads_barrier &&) = default;
75+
76+
void operator()() {
77+
std::unique_lock<std::mutex> lock(*mutex);
78+
auto l = current_latch;
79+
if (l->wait(lock)) {
80+
current_latch = std::shared_ptr<latch>(new latch(num_threads));
81+
}
82+
}
83+
84+
private:
85+
size_t num_threads;
86+
std::shared_ptr<std::mutex> mutex;
87+
std::shared_ptr<latch> current_latch;
88+
};
89+
90+
/* Executes func once and returns its wall-clock duration, measured with the
 * steady clock and expressed in TimeUnit ticks. */
template <typename TimeUnit, typename F>
typename TimeUnit::rep measure(F &&func) {
    const auto begin = std::chrono::steady_clock::now();
    func();
    const auto end = std::chrono::steady_clock::now();
    return std::chrono::duration_cast<TimeUnit>(end - begin).count();
}
100+
101+
/* Measure time of execution of run_workload(thread_id) function. */
102+
template <typename TimeUnit, typename F>
103+
auto measure(size_t iterations, size_t concurrency, F &&run_workload) {
104+
using ResultsType = typename TimeUnit::rep;
105+
std::vector<ResultsType> results;
106+
107+
for (size_t i = 0; i < iterations; i++) {
108+
std::vector<ResultsType> iteration_results(concurrency);
109+
syncthreads_barrier syncthreads(concurrency);
110+
parallel_exec(concurrency, [&](size_t id) {
111+
syncthreads();
112+
113+
iteration_results[id] =
114+
measure<TimeUnit>([&]() { run_workload(id); });
115+
116+
syncthreads();
117+
});
118+
results.insert(results.end(), iteration_results.begin(),
119+
iteration_results.end());
120+
}
121+
122+
return results;
123+
}
124+
125+
/* Smallest sample; `values` must be non-empty. */
template <typename T> T min(const std::vector<T> &values) {
    return *std::min_element(values.cbegin(), values.cend());
}

/* Largest sample; `values` must be non-empty. */
template <typename T> T max(const std::vector<T> &values) {
    return *std::max_element(values.cbegin(), values.cend());
}
132+
133+
/* Arithmetic mean of all samples. */
template <typename T> double mean(const std::vector<T> &values) {
    return std::accumulate(values.begin(), values.end(), 0.0) / values.size();
}

/* Population standard deviation of all samples.
 * The squared differences are accumulated as doubles: the previous code
 * stored them in std::vector<T>, which truncated the fractional part for
 * integral T (e.g. the chrono tick counts produced by measure()), skewing
 * the result — samples {1, 2} yielded 0 instead of 0.5. */
template <typename T> double std_dev(const std::vector<T> &values) {
    auto m = mean(values);
    std::vector<double> diff_squares;
    diff_squares.reserve(values.size());

    for (auto &v : values) {
        diff_squares.push_back(std::pow(static_cast<double>(v) - m, 2.0));
    }

    return std::sqrt(mean(diff_squares));
}
148+
149+
} // namespace umf_bench

0 commit comments

Comments
 (0)