
Commit 566209d

kirklandsign authored and facebook-github-bot committed
Implementation thread parallel with threadpool (#2173)
Summary:
Pull Request resolved: #2173

Use the ET threadpool (with underlying pthreadpool) to provide `parallel_for` functionality.

Reviewed By: kimishpatel

Differential Revision: D54335940

fbshipit-source-id: 0865d0c76d1f16c325da8c13656fa955d6a48ade
1 parent 568673e commit 566209d
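
For orientation before the diff: the new API is torch::executor::parallel_for(begin, end, grain_size, f), declared in extension/parallel/thread_parallel.h (added below). A minimal caller-side sketch follows; the buffer, sizes, and function name are purely illustrative and not part of this commit.

#include <array>
#include <cstdint>

#include <executorch/extension/parallel/thread_parallel.h>

namespace {

// Illustration only: square each element of a fixed-size buffer in parallel.
void square_all(std::array<int64_t, 1024>& data) {
  torch::executor::parallel_for(
      0,
      static_cast<int64_t>(data.size()),
      /*grain_size=*/64,
      [&data](int64_t begin, int64_t end) {
        // Each task receives a disjoint half-open slice [begin, end), so no
        // locking is needed as long as it only touches its own slice.
        for (int64_t i = begin; i < end; ++i) {
          data[i] = data[i] * data[i];
        }
      });
}

} // namespace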

File tree

7 files changed: +363 additions, -0 deletions


extension/parallel/TARGETS

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
# Any targets that should be shared between fbcode and xplat must be defined in
# targets.bzl. This file can contain fbcode-only targets.

load(":targets.bzl", "define_common_targets")

oncall("executorch")

define_common_targets()

extension/parallel/targets.bzl

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

def define_common_targets():
    """Defines targets that should be shared between fbcode and xplat.

    The directory containing this targets.bzl file should also contain both
    TARGETS and BUCK files that call this function.
    """

    for aten_mode in (True, False):
        aten_suffix = ("_aten" if aten_mode else "")

        runtime.cxx_library(
            name = "thread_parallel" + aten_suffix,
            srcs = [
                "thread_parallel.cpp",
            ],
            exported_headers = [
                "thread_parallel.h",
            ],
            visibility = [
                "//executorch/...",
            ],
            deps = [
                "//executorch/backends/xnnpack/threadpool:threadpool",
                "//executorch/runtime/core:core",
            ],
        )

extension/parallel/test/TARGETS

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
# Any targets that should be shared between fbcode and xplat must be defined in
# targets.bzl. This file can contain fbcode-only targets.

load(":targets.bzl", "define_common_targets")

oncall("executorch")

define_common_targets()

extension/parallel/test/targets.bzl

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

def define_common_targets():
    """Defines targets that should be shared between fbcode and xplat.

    The directory containing this targets.bzl file should also contain both
    TARGETS and BUCK files that call this function.
    """

    runtime.cxx_test(
        name = "thread_parallel_test",
        srcs = [
            "thread_parallel_test.cpp",
        ],
        deps = [
            "//executorch/extension/parallel:thread_parallel",
        ],
    )

extension/parallel/test/thread_parallel_test.cpp

Lines changed: 202 additions & 0 deletions
@@ -0,0 +1,202 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>

#include <array>
#include <mutex>

#include <executorch/extension/parallel/thread_parallel.h>
#include <executorch/test/utils/DeathTest.h>

using namespace ::testing;

namespace torch::executor {

class ParallelTest : public ::testing::Test {
 protected:
  void SetUp() override {
    data_.fill(0);
    sum_of_all_elements_ = 0;
  }

  void RunTask(int64_t begin, int64_t end) {
    for (int64_t j = begin; j < end; ++j) {
      // Check that we haven't written to this index before
      EXPECT_EQ(data_[j], 0);
      data_[j] = j;
    }
  }

  void RunExclusiveTask(int64_t begin, int64_t end) {
    for (int64_t j = begin; j < end; ++j) {
      // Check that we haven't written to this index before
      EXPECT_EQ(data_[j], 0);
      std::lock_guard<std::mutex> lock(mutex_);
      data_[j] = j;
      sum_of_all_elements_ += data_[j];
    }
  }

  std::array<int, 10> data_;
  std::mutex mutex_;
  int sum_of_all_elements_;
};

TEST_F(ParallelTest, TestAllInvoked) {
  parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) {
    this->RunTask(begin, end);
  });

  for (int64_t i = 0; i < 10; ++i) {
    EXPECT_EQ(data_[i], i);
  }
}

TEST_F(ParallelTest, TestAllInvokedWithMutex) {
  parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) {
    this->RunExclusiveTask(begin, end);
  });

  int expected_sum = 0;
  for (int64_t i = 0; i < 10; ++i) {
    EXPECT_EQ(data_[i], i);
    expected_sum += i;
  }
  EXPECT_EQ(sum_of_all_elements_, expected_sum);
}

TEST_F(ParallelTest, TestInvalidRange) {
  ET_EXPECT_DEATH(
      {
        parallel_for(10, 0, 1, [this](int64_t begin, int64_t end) {
          this->RunExclusiveTask(begin, end);
        });
      },
      "");

  for (int64_t i = 0; i < 10; ++i) {
    EXPECT_EQ(data_[i], 0);
  }
  EXPECT_EQ(sum_of_all_elements_, 0);
}

TEST_F(ParallelTest, TestInvalidRange2) {
  ET_EXPECT_DEATH(
      {
        parallel_for(6, 5, 1, [this](int64_t begin, int64_t end) {
          this->RunExclusiveTask(begin, end);
        });
      },
      "");

  for (int64_t i = 0; i < 10; ++i) {
    EXPECT_EQ(data_[i], 0);
  }
  EXPECT_EQ(sum_of_all_elements_, 0);
}

TEST_F(ParallelTest, TestInvokePartialFromBeginning) {
  parallel_for(0, 5, 1, [this](int64_t begin, int64_t end) {
    this->RunTask(begin, end);
  });

  for (int64_t i = 0; i < 5; ++i) {
    EXPECT_EQ(data_[i], i);
  }
  for (int64_t i = 5; i < 10; ++i) {
    EXPECT_EQ(data_[i], 0);
  }
}

TEST_F(ParallelTest, TestInvokePartialToEnd) {
  parallel_for(5, 10, 1, [this](int64_t begin, int64_t end) {
    this->RunTask(begin, end);
  });

  for (int64_t i = 0; i < 5; ++i) {
    EXPECT_EQ(data_[i], 0);
  }
  for (int64_t i = 5; i < 10; ++i) {
    EXPECT_EQ(data_[i], i);
  }
}

TEST_F(ParallelTest, TestInvokePartialMiddle) {
  parallel_for(2, 8, 1, [this](int64_t begin, int64_t end) {
    this->RunTask(begin, end);
  });

  for (int64_t i = 0; i < 2; ++i) {
    EXPECT_EQ(data_[i], 0);
  }
  for (int64_t i = 2; i < 8; ++i) {
    EXPECT_EQ(data_[i], i);
  }
  for (int64_t i = 8; i < 10; ++i) {
    EXPECT_EQ(data_[i], 0);
  }
}

TEST_F(ParallelTest, TestChunkSize2) {
  parallel_for(0, 10, 2, [this](int64_t begin, int64_t end) {
    this->RunTask(begin, end);
  });

  for (int64_t i = 0; i < 10; ++i) {
    EXPECT_EQ(data_[i], i);
  }
}

TEST_F(ParallelTest, TestChunkSize2Middle) {
  parallel_for(3, 8, 2, [this](int64_t begin, int64_t end) {
    this->RunTask(begin, end);
  });

  for (int64_t i = 0; i < 3; ++i) {
    EXPECT_EQ(data_[i], 0);
  }
  for (int64_t i = 3; i < 8; ++i) {
    EXPECT_EQ(data_[i], i);
  }
  for (int64_t i = 8; i < 10; ++i) {
    EXPECT_EQ(data_[i], 0);
  }
}

TEST_F(ParallelTest, TestChunkSize3) {
  parallel_for(0, 10, 3, [this](int64_t begin, int64_t end) {
    this->RunTask(begin, end);
  });

  for (int64_t i = 0; i < 10; ++i) {
    EXPECT_EQ(data_[i], i);
  }
}

TEST_F(ParallelTest, TestChunkSize6) {
  parallel_for(0, 10, 6, [this](int64_t begin, int64_t end) {
    this->RunTask(begin, end);
  });

  for (int64_t i = 0; i < 10; ++i) {
    EXPECT_EQ(data_[i], i);
  }
}

TEST_F(ParallelTest, TestChunkSizeTooLarge) {
  parallel_for(0, 10, 11, [this](int64_t begin, int64_t end) {
    this->RunTask(begin, end);
  });

  for (int64_t i = 0; i < 10; ++i) {
    EXPECT_EQ(data_[i], i);
  }
}

} // namespace torch::executor

extension/parallel/thread_parallel.cpp

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <tuple>

#include <executorch/backends/xnnpack/threadpool/threadpool.h>
#include <executorch/extension/parallel/thread_parallel.h>
#include <executorch/runtime/platform/assert.h>

namespace torch::executor {

using namespace torch::executorch::threadpool;

inline int64_t divup(int64_t x, int64_t y) {
  return (x + y - 1) / y;
}

inline std::tuple<int64_t, int64_t>
calc_num_tasks_and_chunk_size(int64_t begin, int64_t end, int64_t grain_size) {
  if ((end - begin) < grain_size) {
    return std::make_tuple(1, std::max((int64_t)0, end - begin));
  }
  // Choose number of tasks based on grain size and number of threads.
  int64_t chunk_size =
      divup((end - begin), get_threadpool()->get_thread_count());
  // Make sure each task is at least grain_size size.
  chunk_size = std::max(grain_size, chunk_size);
  int64_t num_tasks = divup((end - begin), chunk_size);
  return std::make_tuple(num_tasks, chunk_size);
}

void parallel_for(
    const int64_t begin,
    const int64_t end,
    const int64_t grain_size,
    const std::function<void(int64_t, int64_t)>& f) {
  ET_CHECK_MSG(begin >= 0 && end >= 0, "Begin and end should be non-negative");
  ET_CHECK_MSG(end >= begin, "end should be greater than or equal to begin");
  ET_CHECK_MSG(grain_size > 0, "grain_size should be positive");
  int64_t num_tasks = 0, chunk_size = 0;
  std::tie(num_tasks, chunk_size) =
      calc_num_tasks_and_chunk_size(begin, end, grain_size);

  auto task = [f, begin, end, chunk_size](size_t task_id) {
    int64_t local_start = begin + static_cast<int64_t>(task_id) * chunk_size;
    if (local_start < end) {
      int64_t local_end = std::min(end, (int64_t)(chunk_size + local_start));
      f(local_start, local_end);
    }
  };

  // Per protocol from threadpool (pthreadpool), when this returns, all tasks
  // are executed, so this is synchronous.
  get_threadpool()->run(task, num_tasks);
}

} // namespace torch::executor
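
To make the chunking arithmetic in calc_num_tasks_and_chunk_size concrete, here is a small standalone sketch that re-implements the same math purely for illustration; the 4-thread pool size is an assumption, since in the real code it comes from get_threadpool()->get_thread_count().

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Re-implementation of the helper above, for illustration only.
static int64_t divup(int64_t x, int64_t y) {
  return (x + y - 1) / y;
}

int main() {
  // Assumed thread count; the library queries the threadpool for this.
  const int64_t threads = 4;
  const int64_t begin = 0, end = 100, grain_size = 8;

  // Same logic as the non-degenerate branch of calc_num_tasks_and_chunk_size.
  int64_t chunk_size = divup(end - begin, threads); // divup(100, 4) == 25
  chunk_size = std::max(grain_size, chunk_size); // max(8, 25) == 25
  const int64_t num_tasks = divup(end - begin, chunk_size); // divup(100, 25) == 4

  // Each task covers [begin + i * chunk_size, min(end, begin + (i + 1) * chunk_size)).
  for (int64_t i = 0; i < num_tasks; ++i) {
    const int64_t local_start = begin + i * chunk_size;
    const int64_t local_end = std::min(end, local_start + chunk_size);
    std::printf(
        "task %lld: [%lld, %lld)\n",
        (long long)i,
        (long long)local_start,
        (long long)local_end);
  }
  return 0;
}

With begin=0, end=100, grain_size=8 and 4 threads, chunk_size becomes max(8, 25) = 25 and num_tasks becomes 4, so the tasks cover [0, 25), [25, 50), [50, 75), [75, 100).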

extension/parallel/thread_parallel.h

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <cstdint>
// @nolint PATTERNLINT Ok to use stdlib for this optional library
#include <functional>

namespace torch::executor {

/**
 * A helper to run a function in parallel.
 *
 * begin, end: describe the extent of the work items via the first and
 * one-past-the-last work item to be processed
 * grain_size: number of work items processed per chunk by the user callback
 * described below
 * f: user function applied in parallel to the chunks, signature:
 *   void f(int64_t begin, int64_t end)
 *
 * Warning: parallel_for does NOT copy thread-local state from the current
 * thread to the worker threads. Users need to protect access to captured
 * data if they mutate it in f.
 */
void parallel_for(
    const int64_t begin,
    const int64_t end,
    const int64_t grain_size,
    const std::function<void(int64_t, int64_t)>& f);

} // namespace torch::executor
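
The warning above about captured data is worth a concrete sketch: the [begin, end) slices handed to f are disjoint, but a shared accumulator still needs synchronization because different slices may run on different worker threads. The example below is illustrative only (the function name, element type, and grain size are made up); it mirrors the mutex pattern used by RunExclusiveTask in the tests.

#include <cstdint>
#include <mutex>
#include <vector>

#include <executorch/extension/parallel/thread_parallel.h>

// Illustration only: sum a vector in parallel. Each task accumulates a local
// sum over its own disjoint [begin, end) slice; only the final addition into
// the shared total needs the lock.
int64_t parallel_sum(const std::vector<int64_t>& in) {
  std::mutex m;
  int64_t total = 0;
  torch::executor::parallel_for(
      0,
      static_cast<int64_t>(in.size()),
      /*grain_size=*/32,
      [&](int64_t begin, int64_t end) {
        int64_t local = 0;
        for (int64_t i = begin; i < end; ++i) {
          local += in[i];
        }
        std::lock_guard<std::mutex> lock(m);
        total += local;
      });
  return total;
}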
