Commit c70b047

[SYCL][PI][L0] Add dynamic batch size adjustment (#2792)
* Add dynamic batch size adjustment

These changes add code to implement dynamic command list batch size adjustment, change the documentation of the environment variable that can be used to control command list batching, and update and add tests for the batching feature.
1 parent dd7e401 commit c70b047

5 files changed: 300 additions, 11 deletions

sycl/doc/EnvironmentVariables.md

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ subject to change. Do not rely on these variables in production code.
 | SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images |
 | SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. |
 | SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) |
-| SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Positive integer | Sets a preferred number of commands to batch into a command list before executing the command list. Values 0 and 1 turn off batching. Default is 4. |
+| SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Integer | Sets a preferred number of commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. |
 
 `(*) Note: Any means this environment variable is effective when set to any non-null value.`
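
Editorial note: a minimal sketch of how a plugin might derive its batching mode from this variable, mirroring the default-value change in the pi_level_zero.cpp hunk below. The helper function and its treatment of non-positive or malformed values are illustrative assumptions, not part of this commit.

```cpp
#include <cstdint>
#include <cstdlib>

// Hypothetical helper: 0 (or an unset variable) selects dynamic batch size
// adjustment; a positive value selects a fixed batch size of that many commands.
static uint32_t readBatchSizeSetting() {
  uint32_t BatchSizeVal = 0; // default: dynamic adjustment
  if (const char *BatchSizeStr = std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE")) {
    int Val = std::atoi(BatchSizeStr);
    if (Val > 0) // assumption: non-positive or malformed values keep the default
      BatchSizeVal = static_cast<uint32_t>(Val);
  }
  return BatchSizeVal;
}
```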

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 53 additions & 5 deletions
@@ -440,10 +440,8 @@ _pi_queue::resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList,
 }
 
 static const pi_uint32 ZeCommandListBatchSize = [] {
-  // Default value of 4. This has been seen as a good tradeoff between
-  // lower overhead of number of enqueue and fence calls, and getting
-  // commands seen as soon possible (i.e. lazy vs eager submission).
-  pi_uint32 BatchSizeVal = 4;
+  // Default value of 0. This specifies to use dynamic batch size adjustment.
+  pi_uint32 BatchSizeVal = 0;
   const auto BatchSizeStr = std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE");
   if (BatchSizeStr) {
     pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr);
@@ -550,6 +548,49 @@ pi_result _pi_device::getAvailableCommandList(
   return pi_result;
 }
 
+void _pi_queue::adjustBatchSizeForFullBatch() {
+  // QueueBatchSize of 0 means never allow batching.
+  if (QueueBatchSize == 0 || !UseDynamicBatching)
+    return;
+
+  NumTimesClosedFull += 1;
+
+  // If the number of times the list has been closed early is low, and
+  // the number of times it has been closed full is high, then raise
+  // the batching size slowly. Don't raise it if it is already pretty
+  // high.
+  if (NumTimesClosedEarly <= 2 && NumTimesClosedFull > 10) {
+    if (QueueBatchSize < 16) {
+      QueueBatchSize = QueueBatchSize + 1;
+      zePrint("Raising QueueBatchSize to %d\n", QueueBatchSize);
+    }
+    NumTimesClosedEarly = 0;
+    NumTimesClosedFull = 0;
+  }
+}
+
+void _pi_queue::adjustBatchSizeForPartialBatch(pi_uint32 PartialBatchSize) {
+  // QueueBatchSize of 0 means never allow batching.
+  if (QueueBatchSize == 0 || !UseDynamicBatching)
+    return;
+
+  NumTimesClosedEarly += 1;
+
+  // If we are closing early more than about 3x the number of times
+  // it is closing full, lower the batch size to the value of the
+  // current open command list. This is trying to quickly get to a
+  // batch size that will be able to be closed full at least once
+  // in a while.
+  if (NumTimesClosedEarly > (NumTimesClosedFull + 1) * 3) {
+    QueueBatchSize = PartialBatchSize - 1;
+    if (QueueBatchSize < 1)
+      QueueBatchSize = 1;
+    zePrint("Lowering QueueBatchSize to %d\n", QueueBatchSize);
+    NumTimesClosedEarly = 0;
+    NumTimesClosedFull = 0;
+  }
+}
+
 pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList,
                                         ze_fence_handle_t ZeFence,
                                         bool IsBlocking,
@@ -572,6 +613,8 @@ pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList,
     return PI_SUCCESS;
   }
 
+  adjustBatchSizeForFullBatch();
+
   this->ZeOpenCommandList = nullptr;
   this->ZeOpenCommandListFence = nullptr;
   this->ZeOpenCommandListSize = 0;
@@ -592,7 +635,7 @@ pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList,
 }
 
 bool _pi_queue::isBatchingAllowed() {
-  return (this->QueueBatchSize > 1 && ((ZeSerialize & ZeSerializeBlock) == 0));
+  return (this->QueueBatchSize > 0 && ((ZeSerialize & ZeSerializeBlock) == 0));
 }
 
 pi_result _pi_queue::executeOpenCommandList() {
@@ -602,6 +645,8 @@ pi_result _pi_queue::executeOpenCommandList() {
   if (OpenList) {
     auto OpenListFence = this->ZeOpenCommandListFence;
 
+    adjustBatchSizeForPartialBatch(this->ZeOpenCommandListSize);
+
     this->ZeOpenCommandList = nullptr;
     this->ZeOpenCommandListFence = nullptr;
     this->ZeOpenCommandListSize = 0;
@@ -1860,6 +1905,9 @@ pi_result piQueueRelease(pi_queue Queue) {
     Queue->ZeCommandListFenceMap.clear();
     ZE_CALL(zeCommandQueueDestroy(Queue->ZeCommandQueue));
     Queue->ZeCommandQueue = nullptr;
+
+    zePrint("piQueueRelease NumTimesClosedFull %d, NumTimesClosedEarly %d\n",
+            Queue->NumTimesClosedFull, Queue->NumTimesClosedEarly);
   }
   return PI_SUCCESS;
 }
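
Editorial note: to see how the two adjustment paths above interact, here is a minimal, self-contained sketch of the same heuristic with the batching counters pulled out of `_pi_queue` into a toy struct. The struct, its method names, and the driver in `main` are illustrative only; the thresholds and update rules are the ones from the functions above.

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative stand-in for _pi_queue's batching state (not the real type).
struct BatchHeuristic {
  uint32_t QueueBatchSize = 2; // DynamicBatchStartSize in the header diff
  uint32_t NumTimesClosedEarly = 0;
  uint32_t NumTimesClosedFull = 0;

  // Mirrors adjustBatchSizeForFullBatch(): grow slowly, stop near 16.
  void closedFull() {
    NumTimesClosedFull += 1;
    if (NumTimesClosedEarly <= 2 && NumTimesClosedFull > 10) {
      if (QueueBatchSize < 16)
        QueueBatchSize += 1;
      NumTimesClosedEarly = 0;
      NumTimesClosedFull = 0;
    }
  }

  // Mirrors adjustBatchSizeForPartialBatch(): shrink toward the size the
  // application actually fills before it flushes the open command list.
  void closedEarly(uint32_t PartialBatchSize) {
    NumTimesClosedEarly += 1;
    if (NumTimesClosedEarly > (NumTimesClosedFull + 1) * 3) {
      QueueBatchSize = PartialBatchSize > 1 ? PartialBatchSize - 1 : 1;
      NumTimesClosedEarly = 0;
      NumTimesClosedFull = 0;
    }
  }
};

int main() {
  BatchHeuristic h;
  for (int i = 0; i < 40; ++i) // a long run of full batches raises the size
    h.closedFull();
  std::printf("after full batches: %u\n", h.QueueBatchSize);
  for (int i = 0; i < 20; ++i) // repeated early flushes of 4 commands lower it
    h.closedEarly(4);
  std::printf("after early flushes: %u\n", h.QueueBatchSize);
  return 0;
}
```

Under this policy a sustained run of full batches ratchets the size up by one roughly every eleven closures (capped at 16), while a burst of early flushes quickly snaps the size down to just under whatever the application actually submits before waiting.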

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 27 additions & 2 deletions
@@ -271,11 +271,15 @@ struct _pi_context : _pi_object {
   std::mutex NumEventsLiveInEventPoolMutex;
 };
 
+// If doing dynamic batching, start batch size at 2.
+const pi_uint32 DynamicBatchStartSize = 2;
+
 struct _pi_queue : _pi_object {
   _pi_queue(ze_command_queue_handle_t Queue, pi_context Context,
-            pi_device Device, pi_uint32 QueueBatchSize)
+            pi_device Device, pi_uint32 BatchSize)
       : ZeCommandQueue{Queue}, Context{Context}, Device{Device},
-        QueueBatchSize{QueueBatchSize} {}
+        QueueBatchSize{BatchSize > 0 ? BatchSize : DynamicBatchStartSize},
+        UseDynamicBatching{BatchSize == 0} {}
 
   // Level Zero command queue handle.
   ze_command_queue_handle_t ZeCommandQueue;
@@ -310,6 +314,18 @@ struct _pi_queue : _pi_object {
   // is thread safe because of the locking of the queue that occurs.
   pi_uint32 QueueBatchSize = {0};
 
+  // specifies whether this queue will be using dynamic batch size adjustment
+  // or not. This is set only at queue creation time, and is therefore
+  // const for the life of the queue.
+  const bool UseDynamicBatching;
+
+  // These two members are used to keep track of how often the
+  // batching closes and executes a command list before reaching the
+  // QueueBatchSize limit, versus how often we reach the limit.
+  // This info might be used to vary the QueueBatchSize value.
+  pi_uint32 NumTimesClosedEarly = {0};
+  pi_uint32 NumTimesClosedFull = {0};
+
   // Map of all Command lists created with their associated Fence used for
   // tracking when the command list is available for use again.
   std::map<ze_command_list_handle_t, ze_fence_handle_t> ZeCommandListFenceMap;
@@ -318,6 +334,15 @@ struct _pi_queue : _pi_object {
   // be batched together.
   bool isBatchingAllowed();
 
+  // adjust the queue's batch size, knowing that the current command list
+  // is being closed with a full batch.
+  void adjustBatchSizeForFullBatch();
+
+  // adjust the queue's batch size, knowing that the current command list
+  // is being closed with only a partial batch of commands. How many commands
+  // are in this partial closure is passed as the parameter.
+  void adjustBatchSizeForPartialBatch(pi_uint32 PartialBatchSize);
+
   // Resets the Command List and Associated fence in the ZeCommandListFenceMap.
   // If the reset command list should be made available, then MakeAvailable
   // needs to be set to true. The caller must verify that this command list and
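
Editorial note: a small illustration of what the new constructor initializers mean for the two modes. Only the batching-related members are modeled here; the real constructor also stores the Level Zero queue, context, and device handles.

```cpp
#include <cassert>
#include <cstdint>

constexpr uint32_t DynamicBatchStartSize = 2; // same constant as in the header diff

// Toy model of the batching-related members of _pi_queue.
struct QueueBatchingFields {
  uint32_t QueueBatchSize;
  bool UseDynamicBatching;
  explicit QueueBatchingFields(uint32_t BatchSize)
      : QueueBatchSize{BatchSize > 0 ? BatchSize : DynamicBatchStartSize},
        UseDynamicBatching{BatchSize == 0} {}
};

int main() {
  QueueBatchingFields dynamicQueue{0}; // SYCL_PI_LEVEL_ZERO_BATCH_SIZE=0 or unset
  assert(dynamicQueue.QueueBatchSize == 2 && dynamicQueue.UseDynamicBatching);

  QueueBatchingFields fixedQueue{8};   // SYCL_PI_LEVEL_ZERO_BATCH_SIZE=8
  assert(fixedQueue.QueueBatchSize == 8 && !fixedQueue.UseDynamicBatching);
  return 0;
}
```

In other words, leaving the variable unset (or setting it to 0) starts the queue at a batch size of 2 and lets the heuristics in pi_level_zero.cpp move it up or down; any positive setting pins the batch size for the life of the queue.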

sycl/test/plugins/level_zero_batch_test.cpp

Lines changed: 0 additions & 3 deletions
@@ -2,9 +2,6 @@
 
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 
-// Default batching should be 4
-// RUN: env SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB4 %s
-
 // Set batching to 4 explicitly
 // RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=4 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB4 %s
 
sycl/test/plugins/level_zero_dynamic_batch_test.cpp

Lines changed: 219 additions & 0 deletions
@@ -0,0 +1,219 @@
+// REQUIRES: gpu, level_zero
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+
+// Check that dynamic batching increases batch size
+// RUN: env SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKDYN %s
+
+// level_zero_dynamic_batch_test.cpp
+//
+// This tests the Level Zero plugin's kernel dynamic batch size adjustment
+// code.
+// It starts out by enqueueing 40 kernels before it does a wait, and it does
+// this 5 times. That should cause the dynamic batch size adjustment to
+// raise the batch size several times.
+//
+// Then the test starts enqueueing only 4 kernels before doing a wait, and
+// it does that 5 times as well. That should cause the batch size to
+// be lowered, just once, to less than 4.
+//
+// CKDYN: Raising QueueBatchSize to 3
+// CKDYN: Raising QueueBatchSize to 4
+// CKDYN-NOT: Raising QueueBatchSize
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKDYN: Lowering QueueBatchSize to 3
+// CKDYN-NOT: Lowering QueueBatchSize
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+
#include "CL/sycl.hpp"
38+
#include <chrono>
39+
#include <cmath>
40+
#include <iostream>
41+
42+
namespace sycl = cl::sycl;
43+
44+
void validate(uint32_t *result, uint32_t *expect, size_t n) {
45+
int error = 0;
46+
for (int i = 0; i < n; i++) {
47+
if (result[i] != expect[i]) {
48+
error++;
49+
if (error < 10) {
50+
printf("Error: %d, expect: %d\n", result[i], expect[i]);
51+
}
52+
}
53+
}
54+
error > 0 ? printf("Error: %d\n", error) : printf("Test Pass\n");
55+
}
56+
57+
int main(int argc, char *argv[]) {
58+
size_t M = 65536;
59+
size_t N = 512 / 4;
60+
size_t AL = M * N * sizeof(uint32_t);
61+
62+
sycl::queue q(sycl::default_selector{});
63+
auto ctx = q.get_context();
64+
auto dev = q.get_device();
65+
66+
uint32_t *Y1 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
67+
uint32_t *Z1 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
68+
uint32_t *Z2 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
69+
uint32_t *Z3 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
70+
uint32_t *Z4 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
71+
uint32_t *Z5 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
72+
uint32_t *Z6 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
73+
uint32_t *Z7 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
74+
uint32_t *Z8 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
75+
76+
for (size_t i = 0; i < M * N; i++) {
77+
Y1[i] = i % 255;
78+
}
79+
80+
memset(Z1, '\0', AL);
81+
memset(Z2, '\0', AL);
82+
memset(Z3, '\0', AL);
83+
memset(Z4, '\0', AL);
84+
memset(Z5, '\0', AL);
85+
memset(Z6, '\0', AL);
86+
memset(Z7, '\0', AL);
87+
memset(Z8, '\0', AL);
88+
89+
for (size_t i = 0; i < 5; i++) {
90+
for (size_t j = 0; j < 5; j++) {
91+
q.submit([&](sycl::handler &h) {
92+
h.parallel_for<class u32_copy1>(sycl::range<2>{M, N},
93+
[=](sycl::id<2> it) {
94+
const int m = it[0];
95+
const int n = it[1];
96+
Z1[m * N + n] = Y1[m * N + n];
97+
});
98+
});
99+
q.submit([&](sycl::handler &h) {
100+
h.parallel_for<class u32_copy2>(sycl::range<2>{M, N},
101+
[=](sycl::id<2> it) {
102+
const int m = it[0];
103+
const int n = it[1];
104+
Z2[m * N + n] = Y1[m * N + n];
105+
});
106+
});
107+
q.submit([&](sycl::handler &h) {
108+
h.parallel_for<class u32_copy3>(sycl::range<2>{M, N},
109+
[=](sycl::id<2> it) {
110+
const int m = it[0];
111+
const int n = it[1];
112+
Z3[m * N + n] = Y1[m * N + n];
113+
});
114+
});
115+
q.submit([&](sycl::handler &h) {
116+
h.parallel_for<class u32_copy4>(sycl::range<2>{M, N},
117+
[=](sycl::id<2> it) {
118+
const int m = it[0];
119+
const int n = it[1];
120+
Z4[m * N + n] = Y1[m * N + n];
121+
});
122+
});
123+
q.submit([&](sycl::handler &h) {
124+
h.parallel_for<class u32_copy5>(sycl::range<2>{M, N},
125+
[=](sycl::id<2> it) {
126+
const int m = it[0];
127+
const int n = it[1];
128+
Z5[m * N + n] = Y1[m * N + n];
129+
});
130+
});
131+
q.submit([&](sycl::handler &h) {
132+
h.parallel_for<class u32_copy6>(sycl::range<2>{M, N},
133+
[=](sycl::id<2> it) {
134+
const int m = it[0];
135+
const int n = it[1];
136+
Z6[m * N + n] = Y1[m * N + n];
137+
});
138+
});
139+
q.submit([&](sycl::handler &h) {
140+
h.parallel_for<class u32_copy7>(sycl::range<2>{M, N},
141+
[=](sycl::id<2> it) {
142+
const int m = it[0];
143+
const int n = it[1];
144+
Z7[m * N + n] = Y1[m * N + n];
145+
});
146+
});
147+
q.submit([&](sycl::handler &h) {
148+
h.parallel_for<class u32_copy8>(sycl::range<2>{M, N},
149+
[=](sycl::id<2> it) {
150+
const int m = it[0];
151+
const int n = it[1];
152+
Z8[m * N + n] = Y1[m * N + n];
153+
});
154+
});
155+
}
156+
q.wait();
157+
}
158+
159+
validate(Y1, Z1, M * N);
160+
validate(Y1, Z2, M * N);
161+
validate(Y1, Z3, M * N);
162+
validate(Y1, Z4, M * N);
163+
validate(Y1, Z5, M * N);
164+
validate(Y1, Z6, M * N);
165+
validate(Y1, Z7, M * N);
166+
validate(Y1, Z8, M * N);
167+
168+
for (size_t i = 0; i < 5; i++) {
169+
q.submit([&](sycl::handler &h) {
170+
h.parallel_for<class u32_copy9>(sycl::range<2>{M, N},
171+
[=](sycl::id<2> it) {
172+
const int m = it[0];
173+
const int n = it[1];
174+
Z1[m * N + n] = Y1[m * N + n];
175+
});
176+
});
177+
q.submit([&](sycl::handler &h) {
178+
h.parallel_for<class u32_copy10>(sycl::range<2>{M, N},
179+
[=](sycl::id<2> it) {
180+
const int m = it[0];
181+
const int n = it[1];
182+
Z2[m * N + n] = Y1[m * N + n];
183+
});
184+
});
185+
q.submit([&](sycl::handler &h) {
186+
h.parallel_for<class u32_copy11>(sycl::range<2>{M, N},
187+
[=](sycl::id<2> it) {
188+
const int m = it[0];
189+
const int n = it[1];
190+
Z3[m * N + n] = Y1[m * N + n];
191+
});
192+
});
193+
q.submit([&](sycl::handler &h) {
194+
h.parallel_for<class u32_copy12>(sycl::range<2>{M, N},
195+
[=](sycl::id<2> it) {
196+
const int m = it[0];
197+
const int n = it[1];
198+
Z4[m * N + n] = Y1[m * N + n];
199+
});
200+
});
201+
q.wait();
202+
}
203+
validate(Y1, Z1, M * N);
204+
validate(Y1, Z2, M * N);
205+
validate(Y1, Z3, M * N);
206+
validate(Y1, Z4, M * N);
207+
208+
sycl::free(Y1, ctx);
209+
sycl::free(Z1, ctx);
210+
sycl::free(Z2, ctx);
211+
sycl::free(Z3, ctx);
212+
sycl::free(Z4, ctx);
213+
sycl::free(Z5, ctx);
214+
sycl::free(Z6, ctx);
215+
sycl::free(Z7, ctx);
216+
sycl::free(Z8, ctx);
217+
218+
return 0;
219+
}
