Skip to content

Commit b8fe0d6

Browse files
committed
Add multithreaded benchmark for umf
Helper functions taken from pmemstream repo.
1 parent 4c95ea8 commit b8fe0d6

File tree

5 files changed

+319
-4
lines changed

5 files changed

+319
-4
lines changed

CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ option(UMF_BUILD_LIBUMF_POOL_JEMALLOC "Build the libumf_pool_jemalloc static lib
1313
option(UMF_BUILD_LIBUMF_POOL_SCALABLE "Build the libumf_pool_scalable static library" OFF)
1414
option(UMF_BUILD_TESTS "Build UMF tests" ON)
1515
option(UMF_BUILD_BENCHMARKS "Build UMF benchmarks" OFF)
16+
option(UMF_BUILD_BENCHMARKS_MT "Build UMF multithreaded benchmarks" OFF)
1617
option(UMF_ENABLE_POOL_TRACKING "Build UMF with pool tracking" ON)
1718
option(UMF_DEVELOPER_MODE "Enable developer checks, treats warnings as errors" OFF)
1819
option(UMF_FORMAT_CODE_STYLE "Format UMF code with clang-format" OFF)
@@ -26,7 +27,8 @@ option(USE_MSAN "Enable MemorySanitizer checks" OFF)
2627
# CMake will set up a strict C build, without C++ support.
2728
set(OPTIONS_REQUIRING_CXX
2829
"UMF_BUILD_TESTS"
29-
"UMF_BUILD_LIBUMF_POOL_DISJOINT")
30+
"UMF_BUILD_LIBUMF_POOL_DISJOINT"
31+
"UMF_BUILD_BENCHMARKS_MT")
3032
foreach(option_name ${OPTIONS_REQUIRING_CXX})
3133
if(${option_name})
3234
enable_language(CXX)

README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,14 +78,17 @@ Required packages:
7878
For development and contributions:
7979
- clang-format-15.0 (can be installed with `python -m pip install clang-format==15.0.7`)
8080

81-
For building tests and Disjoint Pool:
81+
For building tests, multithreaded benchmarks and Disjoint Pool:
8282
- C++ compiler with C++17 support
8383

8484
### Benchmark
8585

86-
A simple micro benchmark based on [ubench](https://github.com/sheredom/ubench.h).
86+
UMF comes with a single-threaded micro benchmark based on [ubench](https://github.com/sheredom/ubench.h).
8787
In order to build the benchmark, the `UMF_BUILD_BENCHMARKS` CMake configuration flag has to be turned `ON`.
8888

89+
UMF also provides multithreaded benchmarks that can be enabled by setting the `UMF_BUILD_BENCHMARKS_MT` CMake
90+
configuration flag to `ON`. Multithreaded benchmarks require C++ support.
91+
8992
### Windows
9093

9194
Generating Visual Studio Project. EXE and binaries will be in **build/bin/{build_config}**

benchmark/CMakeLists.txt

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
44

55
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
6-
message(WARNING "The ubench SHOULD NOT be run in the Debug build type!")
6+
message(WARNING "The benchmarks SHOULD NOT be run in the Debug build type!")
77
endif()
88

99
if(UMF_BUILD_LIBUMF_POOL_DISJOINT)
@@ -32,18 +32,44 @@ target_link_libraries(ubench
3232
pthread
3333
m)
3434

35+
if (UMF_BUILD_BENCHMARKS_MT)
36+
add_executable(multithread_bench multithread.cpp)
37+
target_link_libraries(multithread_bench
38+
umf
39+
${LIBS_OPTIONAL}
40+
pthread
41+
m)
42+
target_include_directories(multithread_bench PRIVATE ${UMF_CMAKE_SOURCE_DIR}/include/)
43+
endif()
44+
3545
if (UMF_BUILD_OS_MEMORY_PROVIDER)
3646
target_compile_definitions(ubench PRIVATE UMF_BUILD_OS_MEMORY_PROVIDER=1)
47+
48+
if (UMF_BUILD_BENCHMARKS_MT)
49+
target_compile_definitions(multithread_bench PRIVATE UMF_BUILD_OS_MEMORY_PROVIDER=1)
50+
endif()
3751
endif()
3852

3953
if (UMF_BUILD_LIBUMF_POOL_DISJOINT)
4054
target_compile_definitions(ubench PRIVATE UMF_BUILD_LIBUMF_POOL_DISJOINT=1)
55+
56+
if (UMF_BUILD_BENCHMARKS_MT)
57+
target_compile_definitions(multithread_bench PRIVATE UMF_BUILD_LIBUMF_POOL_DISJOINT=1)
58+
endif()
4159
endif()
4260

4361
if (UMF_BUILD_LIBUMF_POOL_JEMALLOC)
4462
target_compile_definitions(ubench PRIVATE UMF_BUILD_LIBUMF_POOL_JEMALLOC=1)
63+
64+
if (UMF_BUILD_BENCHMARKS_MT)
65+
target_compile_definitions(multithread_bench PRIVATE UMF_BUILD_LIBUMF_POOL_JEMALLOC=1)
66+
endif()
4567
endif()
4668

4769
if (UMF_BUILD_LIBUMF_POOL_SCALABLE)
4870
target_compile_definitions(ubench PRIVATE UMF_BUILD_LIBUMF_POOL_SCALABLE=1)
71+
72+
if (UMF_BUILD_BENCHMARKS_MT)
73+
target_compile_definitions(multithread_bench PRIVATE UMF_BUILD_LIBUMF_POOL_SCALABLE=1)
74+
endif()
4975
endif()

benchmark/multithread.cpp

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
/*
2+
*
3+
* Copyright (C) 2024 Intel Corporation
4+
*
5+
* Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT.
6+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
*
8+
*/
9+
10+
#include "multithread.hpp"
11+
12+
#include <umf/memory_pool.h>
13+
#include <umf/pools/pool_disjoint.h>
14+
#include <umf/pools/pool_jemalloc.h>
15+
#include <umf/pools/pool_scalable.h>
16+
#include <umf/providers/provider_os_memory.h>
17+
18+
#include <iostream>
19+
#include <memory>
20+
#include <numeric>
21+
22+
// Benchmark configuration: number of measured repetitions, allocations per
// thread, worker-thread count and the size of each allocation.
struct bench_params {
    size_t n_repeats = 5;
    size_t n_iterations = 50000;
    size_t n_threads = 20;
    size_t alloc_size = 64;
};
29+
30+
using poolCreateExtParams = std::tuple<umf_memory_pool_ops_t *, void *,
31+
umf_memory_provider_ops_t *, void *>;
32+
33+
static auto poolCreateExtUnique(poolCreateExtParams params) {
34+
umf_memory_pool_handle_t hPool;
35+
auto [pool_ops, pool_params, provider_ops, provider_params] = params;
36+
37+
umf_memory_provider_handle_t provider = nullptr;
38+
auto ret =
39+
umfMemoryProviderCreate(provider_ops, provider_params, &provider);
40+
if (ret != UMF_RESULT_SUCCESS) {
41+
std::cerr << "provider create failed" << std::endl;
42+
abort();
43+
}
44+
45+
ret = umfPoolCreate(pool_ops, provider, pool_params,
46+
UMF_POOL_CREATE_FLAG_OWN_PROVIDER, &hPool);
47+
if (ret != UMF_RESULT_SUCCESS) {
48+
std::cerr << "pool create failed" << std::endl;
49+
abort();
50+
}
51+
52+
return std::shared_ptr<umf_memory_pool_t>(hPool, &umfPoolDestroy);
53+
}
54+
55+
// Runs the multithreaded alloc/free workload on a pool created from `params`
// and prints the mean and standard deviation of the per-thread iteration
// times, plus the total number of failed allocations.
static void mt_alloc_free(poolCreateExtParams params,
                          const bench_params &bench = bench_params()) {
    auto pool = poolCreateExtUnique(params);

    // Per-thread scratch space, preallocated up front so that vector growth
    // does not pollute the measured section.
    std::vector<std::vector<void *>> allocs(bench.n_threads);
    std::vector<size_t> numFailures(bench.n_threads);
    for (auto &v : allocs) {
        v.reserve(bench.n_iterations);
    }

    auto values = umf_bench::measure<std::chrono::milliseconds>(
        bench.n_repeats, bench.n_threads,
        [&, pool = pool.get()](auto thread_id) {
            // size_t loop index matches bench.n_iterations (was `int`, a
            // signed/unsigned comparison).
            for (size_t i = 0; i < bench.n_iterations; i++) {
                allocs[thread_id].push_back(
                    umfPoolMalloc(pool, bench.alloc_size));
                if (!allocs[thread_id].back()) {
                    numFailures[thread_id]++;
                }
            }

            for (size_t i = 0; i < bench.n_iterations; i++) {
                umfPoolFree(pool, allocs[thread_id][i]);
            }

            // clear the vector as this function might be called multiple times
            allocs[thread_id].clear();
        });

    std::cout << "mean: " << umf_bench::mean(values)
              << " [ms] std_dev: " << umf_bench::std_dev(values) << " [ms]"
              << " (total alloc failures: "
              // size_t initial value: accumulating size_t counters into an
              // int (the old `0`) could truncate/overflow.
              << std::accumulate(numFailures.begin(), numFailures.end(),
                                 size_t{0})
              << " out of "
              << bench.n_iterations * bench.n_repeats * bench.n_threads << ")"
              << std::endl;
}
92+
93+
// Runs mt_alloc_free for every pool type that was enabled at configure time;
// pools that were not built are reported as skipped.
int main() {
#if defined(UMF_BUILD_OS_MEMORY_PROVIDER)
    // [[maybe_unused]]: only referenced when at least one pool is also
    // enabled; avoids -Wunused-variable in OS-provider-only builds.
    [[maybe_unused]] auto osParams = umfOsMemoryProviderParamsDefault();
#endif

#if defined(UMF_BUILD_OS_MEMORY_PROVIDER) &&                                   \
    defined(UMF_BUILD_LIBUMF_POOL_SCALABLE)

    // Increase iterations for scalable pool since it runs much faster than the remaining
    // ones.
    bench_params params;
    params.n_iterations *= 20;

    std::cout << "scalable_pool mt_alloc_free: ";
    mt_alloc_free(poolCreateExtParams{umfScalablePoolOps(), nullptr,
                                      umfOsMemoryProviderOps(), &osParams},
                  params);
#else
    std::cout << "skipping scalable_pool mt_alloc_free" << std::endl;
#endif

#if defined(UMF_BUILD_OS_MEMORY_PROVIDER) &&                                   \
    defined(UMF_BUILD_LIBUMF_POOL_JEMALLOC)
    std::cout << "jemalloc_pool mt_alloc_free: ";
    mt_alloc_free(poolCreateExtParams{umfJemallocPoolOps(), nullptr,
                                      umfOsMemoryProviderOps(), &osParams});
#else
    std::cout << "skipping jemalloc_pool mt_alloc_free" << std::endl;
#endif

#if defined(UMF_BUILD_OS_MEMORY_PROVIDER) &&                                   \
    defined(UMF_BUILD_LIBUMF_POOL_DISJOINT)
    auto disjointParams = umfDisjointPoolParamsDefault();

    std::cout << "disjoint_pool mt_alloc_free: ";
    mt_alloc_free(poolCreateExtParams{umfDisjointPoolOps(), &disjointParams,
                                      umfOsMemoryProviderOps(), &osParams});
#else
    std::cout << "skipping disjoint_pool mt_alloc_free" << std::endl;
#endif

    return 0;
}

benchmark/multithread.hpp

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
/*
2+
*
3+
* Copyright (C) 2024 Intel Corporation
4+
*
5+
* Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT.
6+
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
*
8+
*/
9+
10+
#include <algorithm>
11+
#include <chrono>
12+
#include <cmath>
13+
#include <condition_variable>
14+
#include <functional>
15+
#include <mutex>
16+
#include <numeric>
17+
#include <thread>
18+
#include <vector>
19+
20+
namespace umf_bench {
21+
22+
/* Spawns threads_number threads, each executing f(thread_id), and joins all
 * of them before returning. */
template <typename Function>
void parallel_exec(size_t threads_number, Function &&f) {
    std::vector<std::thread> workers;
    workers.reserve(threads_number);

    for (size_t tid = 0; tid < threads_number; ++tid) {
        workers.emplace_back([&f, tid] { f(tid); });
    }

    for (auto &worker : workers) {
        worker.join();
    }
}
35+
36+
/* Single-use countdown latch: `desired` threads call wait(); all block until
 * the last one arrives. wait() must be called with `lock` held on the mutex
 * that guards this latch (the condition variable releases it while waiting).
 * NOTE(review): calling wait() more than `desired` times underflows the
 * size_t counter — the latch is single-use by design (see
 * syncthreads_barrier, which swaps in a fresh latch after each round). */
class latch {
  public:
    // desired - number of threads that must arrive before any are released.
    latch(size_t desired) : counter(desired) {}

    /* Returns true for the last thread arriving at the latch, false for all
     * other threads. */
    bool wait(std::unique_lock<std::mutex> &lock) {
        counter--;
        if (counter > 0) {
            // Predicate re-checked on wakeup, so spurious wakeups are safe.
            cv.wait(lock, [&] { return counter == 0; });
            return false;
        } else {
            /*
             * notify_call could be called outside of a lock
             * (it would perform better) but drd complains
             * in that case
             */
            cv.notify_all();
            return true;
        }
    }

  private:
    std::condition_variable cv;
    size_t counter = 0;
};
62+
63+
/* Implements multi-use barrier (latch). Once all threads arrive at the
64+
* latch, a new latch is allocated and used by all subsequent calls to
65+
* syncthreads. */
66+
struct syncthreads_barrier {
67+
syncthreads_barrier(size_t num_threads) : num_threads(num_threads) {
68+
mutex = std::shared_ptr<std::mutex>(new std::mutex);
69+
current_latch = std::shared_ptr<latch>(new latch(num_threads));
70+
}
71+
72+
syncthreads_barrier(const syncthreads_barrier &) = delete;
73+
syncthreads_barrier &operator=(const syncthreads_barrier &) = delete;
74+
syncthreads_barrier(syncthreads_barrier &&) = default;
75+
76+
void operator()() {
77+
std::unique_lock<std::mutex> lock(*mutex);
78+
auto l = current_latch;
79+
if (l->wait(lock)) {
80+
current_latch = std::shared_ptr<latch>(new latch(num_threads));
81+
}
82+
}
83+
84+
private:
85+
size_t num_threads;
86+
std::shared_ptr<std::mutex> mutex;
87+
std::shared_ptr<latch> current_latch;
88+
};
89+
90+
/* Executes func once and returns its wall-clock duration, measured with the
 * steady clock and expressed in TimeUnit ticks. */
template <typename TimeUnit, typename F>
typename TimeUnit::rep measure(F &&func) {
    const auto begin = std::chrono::steady_clock::now();
    func();
    const auto end = std::chrono::steady_clock::now();
    return std::chrono::duration_cast<TimeUnit>(end - begin).count();
}
100+
101+
/* Measure time of execution of run_workload(thread_id) function. */
102+
template <typename TimeUnit, typename F>
103+
auto measure(size_t iterations, size_t concurrency, F &&run_workload) {
104+
using ResultsType = typename TimeUnit::rep;
105+
std::vector<ResultsType> results;
106+
107+
for (size_t i = 0; i < iterations; i++) {
108+
std::vector<ResultsType> iteration_results(concurrency);
109+
syncthreads_barrier syncthreads(concurrency);
110+
parallel_exec(concurrency, [&](size_t id) {
111+
syncthreads();
112+
113+
iteration_results[id] =
114+
measure<TimeUnit>([&]() { run_workload(id); });
115+
116+
syncthreads();
117+
});
118+
results.insert(results.end(), iteration_results.begin(),
119+
iteration_results.end());
120+
}
121+
122+
return results;
123+
}
124+
125+
/* Smallest sample; `values` must be non-empty. */
template <typename T> T min(const std::vector<T> &values) {
    return *std::min_element(values.cbegin(), values.cend());
}

/* Largest sample; `values` must be non-empty. */
template <typename T> T max(const std::vector<T> &values) {
    return *std::max_element(values.cbegin(), values.cend());
}
132+
133+
/* Arithmetic mean of all samples. */
template <typename T> double mean(const std::vector<T> &values) {
    return std::accumulate(values.begin(), values.end(), 0.0) / values.size();
}

/* Population standard deviation of all samples.
 * The squared differences are accumulated as doubles: the previous code
 * stored them in std::vector<T>, which truncated the fractional part for
 * integral T (e.g. the chrono tick counts produced by measure()), skewing
 * the result — samples {1, 2} yielded 0 instead of 0.5. */
template <typename T> double std_dev(const std::vector<T> &values) {
    auto m = mean(values);
    std::vector<double> diff_squares;
    diff_squares.reserve(values.size());

    for (auto &v : values) {
        diff_squares.push_back(std::pow(static_cast<double>(v) - m, 2.0));
    }

    return std::sqrt(mean(diff_squares));
}
148+
149+
} // namespace umf_bench

0 commit comments

Comments
 (0)