Commit 1e2257d

Add multithreaded benchmark for umf
Helper functions taken from pmemstream repo.
1 parent b1068dc commit 1e2257d

3 files changed: +274 -0 lines changed


benchmark/CMakeLists.txt

Lines changed: 11 additions & 0 deletions
@@ -32,18 +32,29 @@ target_link_libraries(ubench
     pthread
     m)
 
+add_executable(multithread_bench multithread.cpp)
+target_link_libraries(multithread_bench
+    umf
+    ${LIBS_OPTIONAL}
+    pthread
+    m)
+
 if (UMF_BUILD_OS_MEMORY_PROVIDER)
     target_compile_definitions(ubench PRIVATE UMF_BUILD_OS_MEMORY_PROVIDER=1)
+    target_compile_definitions(multithread_bench PRIVATE UMF_BUILD_OS_MEMORY_PROVIDER=1)
 endif()
 
 if (UMF_BUILD_LIBUMF_POOL_DISJOINT)
     target_compile_definitions(ubench PRIVATE UMF_BUILD_LIBUMF_POOL_DISJOINT=1)
+    target_compile_definitions(multithread_bench PRIVATE UMF_BUILD_LIBUMF_POOL_DISJOINT=1)
 endif()
 
 if (UMF_BUILD_LIBUMF_POOL_JEMALLOC)
     target_compile_definitions(ubench PRIVATE UMF_BUILD_LIBUMF_POOL_JEMALLOC=1)
+    target_compile_definitions(multithread_bench PRIVATE UMF_BUILD_LIBUMF_POOL_JEMALLOC=1)
 endif()
 
 if (UMF_BUILD_LIBUMF_POOL_SCALABLE)
     target_compile_definitions(ubench PRIVATE UMF_BUILD_LIBUMF_POOL_SCALABLE=1)
+    target_compile_definitions(multithread_bench PRIVATE UMF_BUILD_LIBUMF_POOL_SCALABLE=1)
 endif()

benchmark/multithread.cpp

Lines changed: 110 additions & 0 deletions
/*
 *
 * Copyright (C) 2024 Intel Corporation
 *
 * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 */

#include "multithread.hpp"

#include <umf/memory_pool.h>
#include <umf/pools/pool_jemalloc.h>
#include <umf/pools/pool_scalable.h>
#include <umf/providers/provider_os_memory.h>

#include <cstdlib>
#include <iostream>
#include <memory>
#include <numeric>
#include <tuple>

static constexpr size_t N_REPEATS = 5;
static constexpr size_t N_ITERATIONS = 50000;
static constexpr size_t N_THREADS = 20;
static constexpr size_t ALLOC_SIZE = 64;

using poolCreateExtParams = std::tuple<umf_memory_pool_ops_t *, void *,
                                       umf_memory_provider_ops_t *, void *>;

static auto poolCreateExtUnique(poolCreateExtParams params) {
    umf_memory_pool_handle_t hPool;
    auto [pool_ops, pool_params, provider_ops, provider_params] = params;

    umf_memory_provider_handle_t provider = nullptr;
    auto ret =
        umfMemoryProviderCreate(provider_ops, provider_params, &provider);
    if (ret != UMF_RESULT_SUCCESS) {
        std::cerr << "provider create failed" << std::endl;
        abort();
    }

    // The pool owns the provider (OWN_PROVIDER), so destroying the pool
    // also destroys the provider.
    ret = umfPoolCreate(pool_ops, provider, pool_params,
                        UMF_POOL_CREATE_FLAG_OWN_PROVIDER, &hPool);
    if (ret != UMF_RESULT_SUCCESS) {
        std::cerr << "pool create failed" << std::endl;
        abort();
    }

    return std::shared_ptr<umf_memory_pool_t>(hPool, &umfPoolDestroy);
}

static void mt_alloc_free(poolCreateExtParams params) {
    auto pool = poolCreateExtUnique(params);

    std::vector<void *> allocs[N_THREADS];
    size_t numFailures[N_THREADS] = {};
    for (auto &v : allocs) {
        v.reserve(N_ITERATIONS);
    }

    auto values = umf_bench::measure<std::chrono::milliseconds>(
        N_REPEATS, N_THREADS, [&, pool = pool.get()](auto thread_id) {
            for (size_t i = 0; i < N_ITERATIONS; i++) {
                allocs[thread_id].push_back(umfPoolMalloc(pool, ALLOC_SIZE));
                if (!allocs[thread_id].back()) {
                    numFailures[thread_id]++;
                }
            }

            for (size_t i = 0; i < N_ITERATIONS; i++) {
                umfPoolFree(pool, allocs[thread_id][i]);
            }

            // clear the vector as this function might be called multiple times
            allocs[thread_id].clear();
        });

    std::cout << "mean: " << umf_bench::mean(values)
              << " [ms] std_dev: " << umf_bench::std_dev(values) << " [ms]"
              << std::endl;
    std::cout << "Total alloc failures: "
              << std::accumulate(numFailures, numFailures + N_THREADS, 0)
              << " out of " << N_ITERATIONS * N_REPEATS * N_THREADS
              << std::endl;
}

int main() {
#if defined(UMF_BUILD_OS_MEMORY_PROVIDER)
    auto osParams = umfOsMemoryProviderParamsDefault();
#endif

#if defined(UMF_BUILD_OS_MEMORY_PROVIDER) && \
    defined(UMF_BUILD_LIBUMF_POOL_SCALABLE)
    std::cout << "scalable_pool mt_alloc_free: " << std::endl;
    mt_alloc_free(poolCreateExtParams{&UMF_SCALABLE_POOL_OPS, nullptr,
                                      &UMF_OS_MEMORY_PROVIDER_OPS, &osParams});
#else
    std::cout << "skipping scalable_pool mt_alloc_free" << std::endl;
#endif

#if defined(UMF_BUILD_OS_MEMORY_PROVIDER) && \
    defined(UMF_BUILD_LIBUMF_POOL_JEMALLOC)
    std::cout << "jemalloc_pool mt_alloc_free: " << std::endl;
    mt_alloc_free(poolCreateExtParams{&UMF_JEMALLOC_POOL_OPS, nullptr,
                                      &UMF_OS_MEMORY_PROVIDER_OPS, &osParams});
#else
    std::cout << "skipping jemalloc_pool mt_alloc_free" << std::endl;
#endif

    return 0;
}
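
Note that the CMake changes also define UMF_BUILD_LIBUMF_POOL_DISJOINT for multithread_bench, but main() above only wires up the scalable and jemalloc pools. A disjoint-pool case could plausibly follow the same pattern; the sketch below is hypothetical, and the pool_disjoint.h header, UMF_DISJOINT_POOL_OPS, and umfDisjointPoolParamsDefault() are assumed names that do not appear anywhere in this commit:

// Hypothetical extension of main(); would require
// #include <umf/pools/pool_disjoint.h>. UMF_DISJOINT_POOL_OPS and
// umfDisjointPoolParamsDefault() are assumed names, not confirmed here.
#if defined(UMF_BUILD_OS_MEMORY_PROVIDER) && \
    defined(UMF_BUILD_LIBUMF_POOL_DISJOINT)
    auto disjointParams = umfDisjointPoolParamsDefault();
    std::cout << "disjoint_pool mt_alloc_free: " << std::endl;
    mt_alloc_free(poolCreateExtParams{&UMF_DISJOINT_POOL_OPS, &disjointParams,
                                      &UMF_OS_MEMORY_PROVIDER_OPS, &osParams});
#else
    std::cout << "skipping disjoint_pool mt_alloc_free" << std::endl;
#endif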

benchmark/multithread.hpp

Lines changed: 153 additions & 0 deletions
/*
 *
 * Copyright (C) 2024 Intel Corporation
 *
 * Under the Apache License v2.0 with LLVM Exceptions. See LICENSE.TXT.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 */

#pragma once

#include <algorithm>
#include <chrono>
#include <cmath>
#include <condition_variable>
#include <functional>
#include <memory>
#include <mutex>
#include <numeric>
#include <thread>
#include <vector>

namespace umf_bench {

/* Run f(thread_id) on threads_number threads and join them all. */
template <typename Function>
void parallel_exec(size_t threads_number, Function &&f) {
    std::vector<std::thread> threads;
    threads.reserve(threads_number);

    for (size_t i = 0; i < threads_number; ++i) {
        threads.emplace_back([&](size_t id) { f(id); }, i);
    }

    for (auto &t : threads) {
        t.join();
    }
}

class latch {
  public:
    latch(size_t desired) : counter(desired) {}

    /* Returns true for the last thread arriving at the latch, false for all
     * other threads. */
    bool wait(std::unique_lock<std::mutex> &lock) {
        counter--;
        if (counter > 0) {
            cv.wait(lock, [&] { return counter == 0; });
            return false;
        } else {
            /*
             * notify_all could be called outside of the lock
             * (it would perform better) but drd complains
             * in that case
             */
            cv.notify_all();
            return true;
        }
    }

  private:
    std::condition_variable cv;
    size_t counter = 0;
};

/* Implements multi-use barrier (latch). Once all threads arrive at the
 * latch, a new latch is allocated and used by all subsequent calls to
 * syncthreads. */
struct syncthreads_barrier {
    syncthreads_barrier(size_t num_threads) : num_threads(num_threads) {
        mutex = std::shared_ptr<std::mutex>(new std::mutex);
        current_latch = std::shared_ptr<latch>(new latch(num_threads));
    }

    syncthreads_barrier(const syncthreads_barrier &) = delete;
    syncthreads_barrier &operator=(const syncthreads_barrier &) = delete;
    syncthreads_barrier(syncthreads_barrier &&) = default;

    void operator()() {
        std::unique_lock<std::mutex> lock(*mutex);
        auto l = current_latch;
        if (l->wait(lock)) {
            current_latch = std::shared_ptr<latch>(new latch(num_threads));
        }
    }

  private:
    size_t num_threads;
    std::shared_ptr<std::mutex> mutex;
    std::shared_ptr<latch> current_latch;
};

/* Measure the wall-clock time of a single call to func(). */
template <typename TimeUnit, typename F>
typename TimeUnit::rep measure(F &&func) {
    auto start = std::chrono::steady_clock::now();

    func();

    auto duration = std::chrono::duration_cast<TimeUnit>(
        std::chrono::steady_clock::now() - start);
    return duration.count();
}

/* Measure the execution time of run_workload(thread_id) on each thread.
 * Threads are synchronized with a barrier before and after the workload;
 * returns one sample per thread per iteration. */
template <typename TimeUnit, typename F>
auto measure(size_t iterations, size_t concurrency, F &&run_workload) {
    using ResultsType = typename TimeUnit::rep;
    std::vector<ResultsType> results;

    for (size_t i = 0; i < iterations; i++) {
        std::vector<ResultsType> iteration_results(concurrency);
        syncthreads_barrier syncthreads(concurrency);
        parallel_exec(concurrency, [&](size_t id) {
            syncthreads();

            iteration_results[id] =
                measure<TimeUnit>([&]() { run_workload(id); });

            syncthreads();
        });
        results.insert(results.end(), iteration_results.begin(),
                       iteration_results.end());
    }

    return results;
}

template <typename T> T min(const std::vector<T> &values) {
    return *std::min_element(values.begin(), values.end());
}

template <typename T> T max(const std::vector<T> &values) {
    return *std::max_element(values.begin(), values.end());
}

template <typename T> double mean(const std::vector<T> &values) {
    return std::accumulate(values.begin(), values.end(), 0.0) / values.size();
}

template <typename T> double std_dev(const std::vector<T> &values) {
    auto m = mean(values);
    // store squared deviations as doubles to avoid truncation for integral T
    std::vector<double> diff_squares;
    diff_squares.reserve(values.size());

    for (auto &v : values) {
        diff_squares.push_back(std::pow((v - m), 2.0));
    }

    auto variance =
        std::accumulate(diff_squares.begin(), diff_squares.end(), 0.0) /
        values.size();

    return std::sqrt(variance);
}

} // namespace umf_bench
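
To see the harness in isolation, here is a minimal, self-contained usage sketch: it times a placeholder per-thread workload with umf_bench::measure and reports the same statistics the benchmark prints. Only the busy-work loop is invented; the umf_bench calls are exactly the ones defined above.

// Standalone usage sketch for the helpers above; the workload is a
// placeholder, everything else uses the umf_bench API as defined here.
#include "multithread.hpp"

#include <iostream>

int main() {
    // 3 repeats x 4 threads -> 12 samples, one per thread per repeat.
    auto samples = umf_bench::measure<std::chrono::milliseconds>(
        3, 4, [](size_t thread_id) {
            // Placeholder workload: spin on some arithmetic so there is
            // something to time.
            volatile size_t sink = 0;
            for (size_t i = 0; i < 1000000; i++) {
                sink = sink + i * thread_id;
            }
        });

    std::cout << "min: " << umf_bench::min(samples)
              << " max: " << umf_bench::max(samples)
              << " mean: " << umf_bench::mean(samples)
              << " std_dev: " << umf_bench::std_dev(samples) << " [ms]"
              << std::endl;
}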
