Commit c70b047

[SYCL][PI][L0] Add dynamic batch size adjustment (#2792)
* Add dynamic batch size adjustment

These changes add code to implement dynamic command list batch size adjustment, change the documentation of the environment variable that can be used to control command list batching, and update and add tests for the batching feature.
1 parent dd7e401 commit c70b047

5 files changed: 300 additions, 11 deletions

sycl/doc/EnvironmentVariables.md

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ subject to change. Do not rely on these variables in production code.
 | SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images |
 | SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. |
 | SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) |
-| SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Positive integer | Sets a preferred number of commands to batch into a command list before executing the command list. Values 0 and 1 turn off batching. Default is 4. |
+| SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Integer | Sets a preferred number of commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. |
 
 `(*) Note: Any means this environment variable is effective when set to any non-null value.`
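
Editorial note: a minimal sketch of how a plugin might derive its batching mode from this variable, mirroring the default-value change in the pi_level_zero.cpp hunk below. The helper function and its treatment of non-positive or malformed values are illustrative assumptions, not part of this commit.

```cpp
#include <cstdint>
#include <cstdlib>

// Hypothetical helper: 0 (or an unset variable) selects dynamic batch size
// adjustment; a positive value selects a fixed batch size of that many commands.
static uint32_t readBatchSizeSetting() {
  uint32_t BatchSizeVal = 0; // default: dynamic adjustment
  if (const char *BatchSizeStr = std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE")) {
    int Val = std::atoi(BatchSizeStr);
    if (Val > 0) // assumption: non-positive or malformed values keep the default
      BatchSizeVal = static_cast<uint32_t>(Val);
  }
  return BatchSizeVal;
}
```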

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 53 additions & 5 deletions
@@ -440,10 +440,8 @@ _pi_queue::resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList,
 }
 
 static const pi_uint32 ZeCommandListBatchSize = [] {
-  // Default value of 4. This has been seen as a good tradeoff between
-  // lower overhead of number of enqueue and fence calls, and getting
-  // commands seen as soon possible (i.e. lazy vs eager submission).
-  pi_uint32 BatchSizeVal = 4;
+  // Default value of 0. This specifies to use dynamic batch size adjustment.
+  pi_uint32 BatchSizeVal = 0;
   const auto BatchSizeStr = std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE");
   if (BatchSizeStr) {
     pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr);
@@ -550,6 +548,49 @@ pi_result _pi_device::getAvailableCommandList(
   return pi_result;
 }
 
+void _pi_queue::adjustBatchSizeForFullBatch() {
+  // QueueBatchSize of 0 means never allow batching.
+  if (QueueBatchSize == 0 || !UseDynamicBatching)
+    return;
+
+  NumTimesClosedFull += 1;
+
+  // If the number of times the list has been closed early is low, and
+  // the number of times it has been closed full is high, then raise
+  // the batching size slowly. Don't raise it if it is already pretty
+  // high.
+  if (NumTimesClosedEarly <= 2 && NumTimesClosedFull > 10) {
+    if (QueueBatchSize < 16) {
+      QueueBatchSize = QueueBatchSize + 1;
+      zePrint("Raising QueueBatchSize to %d\n", QueueBatchSize);
+    }
+    NumTimesClosedEarly = 0;
+    NumTimesClosedFull = 0;
+  }
+}
+
+void _pi_queue::adjustBatchSizeForPartialBatch(pi_uint32 PartialBatchSize) {
+  // QueueBatchSize of 0 means never allow batching.
+  if (QueueBatchSize == 0 || !UseDynamicBatching)
+    return;
+
+  NumTimesClosedEarly += 1;
+
+  // If we are closing early more than about 3x the number of times
+  // it is closing full, lower the batch size to the value of the
+  // current open command list. This is trying to quickly get to a
+  // batch size that will be able to be closed full at least once
+  // in a while.
+  if (NumTimesClosedEarly > (NumTimesClosedFull + 1) * 3) {
+    QueueBatchSize = PartialBatchSize - 1;
+    if (QueueBatchSize < 1)
+      QueueBatchSize = 1;
+    zePrint("Lowering QueueBatchSize to %d\n", QueueBatchSize);
+    NumTimesClosedEarly = 0;
+    NumTimesClosedFull = 0;
+  }
+}
+
 pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList,
                                         ze_fence_handle_t ZeFence,
                                         bool IsBlocking,
@@ -572,6 +613,8 @@ pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList,
     return PI_SUCCESS;
   }
 
+  adjustBatchSizeForFullBatch();
+
   this->ZeOpenCommandList = nullptr;
   this->ZeOpenCommandListFence = nullptr;
   this->ZeOpenCommandListSize = 0;
@@ -592,7 +635,7 @@ pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList,
 }
 
 bool _pi_queue::isBatchingAllowed() {
-  return (this->QueueBatchSize > 1 && ((ZeSerialize & ZeSerializeBlock) == 0));
+  return (this->QueueBatchSize > 0 && ((ZeSerialize & ZeSerializeBlock) == 0));
 }
 
 pi_result _pi_queue::executeOpenCommandList() {
@@ -602,6 +645,8 @@ pi_result _pi_queue::executeOpenCommandList() {
   if (OpenList) {
     auto OpenListFence = this->ZeOpenCommandListFence;
 
+    adjustBatchSizeForPartialBatch(this->ZeOpenCommandListSize);
+
     this->ZeOpenCommandList = nullptr;
     this->ZeOpenCommandListFence = nullptr;
     this->ZeOpenCommandListSize = 0;
@@ -1860,6 +1905,9 @@ pi_result piQueueRelease(pi_queue Queue) {
     Queue->ZeCommandListFenceMap.clear();
     ZE_CALL(zeCommandQueueDestroy(Queue->ZeCommandQueue));
     Queue->ZeCommandQueue = nullptr;
+
+    zePrint("piQueueRelease NumTimesClosedFull %d, NumTimesClosedEarly %d\n",
+            Queue->NumTimesClosedFull, Queue->NumTimesClosedEarly);
   }
   return PI_SUCCESS;
 }
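
Editorial note: to see how the two adjustment paths above interact, here is a minimal, self-contained sketch of the same heuristic with the batching counters pulled out of `_pi_queue` into a toy struct. The struct, its method names, and the driver in `main` are illustrative only; the thresholds and update rules are the ones from the functions above.

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative stand-in for _pi_queue's batching state (not the real type).
struct BatchHeuristic {
  uint32_t QueueBatchSize = 2; // DynamicBatchStartSize in the header diff
  uint32_t NumTimesClosedEarly = 0;
  uint32_t NumTimesClosedFull = 0;

  // Mirrors adjustBatchSizeForFullBatch(): grow slowly, stop near 16.
  void closedFull() {
    NumTimesClosedFull += 1;
    if (NumTimesClosedEarly <= 2 && NumTimesClosedFull > 10) {
      if (QueueBatchSize < 16)
        QueueBatchSize += 1;
      NumTimesClosedEarly = 0;
      NumTimesClosedFull = 0;
    }
  }

  // Mirrors adjustBatchSizeForPartialBatch(): shrink toward the size the
  // application actually fills before it flushes the open command list.
  void closedEarly(uint32_t PartialBatchSize) {
    NumTimesClosedEarly += 1;
    if (NumTimesClosedEarly > (NumTimesClosedFull + 1) * 3) {
      QueueBatchSize = PartialBatchSize > 1 ? PartialBatchSize - 1 : 1;
      NumTimesClosedEarly = 0;
      NumTimesClosedFull = 0;
    }
  }
};

int main() {
  BatchHeuristic h;
  for (int i = 0; i < 40; ++i) // a long run of full batches raises the size
    h.closedFull();
  std::printf("after full batches: %u\n", h.QueueBatchSize);
  for (int i = 0; i < 20; ++i) // repeated early flushes of 4 commands lower it
    h.closedEarly(4);
  std::printf("after early flushes: %u\n", h.QueueBatchSize);
  return 0;
}
```

Under this policy a sustained run of full batches ratchets the size up by one roughly every eleven closures (capped at 16), while a burst of early flushes quickly snaps the size down to just under whatever the application actually submits before waiting.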

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 27 additions & 2 deletions
@@ -271,11 +271,15 @@ struct _pi_context : _pi_object {
   std::mutex NumEventsLiveInEventPoolMutex;
 };
 
+// If doing dynamic batching, start batch size at 2.
+const pi_uint32 DynamicBatchStartSize = 2;
+
 struct _pi_queue : _pi_object {
   _pi_queue(ze_command_queue_handle_t Queue, pi_context Context,
-            pi_device Device, pi_uint32 QueueBatchSize)
+            pi_device Device, pi_uint32 BatchSize)
       : ZeCommandQueue{Queue}, Context{Context}, Device{Device},
-        QueueBatchSize{QueueBatchSize} {}
+        QueueBatchSize{BatchSize > 0 ? BatchSize : DynamicBatchStartSize},
+        UseDynamicBatching{BatchSize == 0} {}
 
   // Level Zero command queue handle.
   ze_command_queue_handle_t ZeCommandQueue;
@@ -310,6 +314,18 @@ struct _pi_queue : _pi_object {
   // is thread safe because of the locking of the queue that occurs.
   pi_uint32 QueueBatchSize = {0};
 
+  // specifies whether this queue will be using dynamic batch size adjustment
+  // or not. This is set only at queue creation time, and is therefore
+  // const for the life of the queue.
+  const bool UseDynamicBatching;
+
+  // These two members are used to keep track of how often the
+  // batching closes and executes a command list before reaching the
+  // QueueBatchSize limit, versus how often we reach the limit.
+  // This info might be used to vary the QueueBatchSize value.
+  pi_uint32 NumTimesClosedEarly = {0};
+  pi_uint32 NumTimesClosedFull = {0};
+
   // Map of all Command lists created with their associated Fence used for
   // tracking when the command list is available for use again.
   std::map<ze_command_list_handle_t, ze_fence_handle_t> ZeCommandListFenceMap;
@@ -318,6 +334,15 @@ struct _pi_queue : _pi_object {
   // be batched together.
   bool isBatchingAllowed();
 
+  // adjust the queue's batch size, knowing that the current command list
+  // is being closed with a full batch.
+  void adjustBatchSizeForFullBatch();
+
+  // adjust the queue's batch size, knowing that the current command list
+  // is being closed with only a partial batch of commands. How many commands
+  // are in this partial closure is passed as the parameter.
+  void adjustBatchSizeForPartialBatch(pi_uint32 PartialBatchSize);
+
   // Resets the Command List and Associated fence in the ZeCommandListFenceMap.
   // If the reset command list should be made available, then MakeAvailable
   // needs to be set to true. The caller must verify that this command list and
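
Editorial note: a small illustration of what the new constructor initializers mean for the two modes. Only the batching-related members are modeled here; the real constructor also stores the Level Zero queue, context, and device handles.

```cpp
#include <cassert>
#include <cstdint>

constexpr uint32_t DynamicBatchStartSize = 2; // same constant as in the header diff

// Toy model of the batching-related members of _pi_queue.
struct QueueBatchingFields {
  uint32_t QueueBatchSize;
  bool UseDynamicBatching;
  explicit QueueBatchingFields(uint32_t BatchSize)
      : QueueBatchSize{BatchSize > 0 ? BatchSize : DynamicBatchStartSize},
        UseDynamicBatching{BatchSize == 0} {}
};

int main() {
  QueueBatchingFields dynamicQueue{0}; // SYCL_PI_LEVEL_ZERO_BATCH_SIZE=0 or unset
  assert(dynamicQueue.QueueBatchSize == 2 && dynamicQueue.UseDynamicBatching);

  QueueBatchingFields fixedQueue{8};   // SYCL_PI_LEVEL_ZERO_BATCH_SIZE=8
  assert(fixedQueue.QueueBatchSize == 8 && !fixedQueue.UseDynamicBatching);
  return 0;
}
```

In other words, leaving the variable unset (or setting it to 0) starts the queue at a batch size of 2 and lets the heuristics in pi_level_zero.cpp move it up or down; any positive setting pins the batch size for the life of the queue.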

sycl/test/plugins/level_zero_batch_test.cpp

Lines changed: 0 additions & 3 deletions
@@ -2,9 +2,6 @@
 
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 
-// Default batching should be 4
-// RUN: env SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB4 %s
-
 // Set batching to 4 explicitly
 // RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=4 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB4 %s
 
sycl/test/plugins/level_zero_dynamic_batch_test.cpp

Lines changed: 219 additions & 0 deletions
@@ -0,0 +1,219 @@
+// REQUIRES: gpu, level_zero
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+
+// Check that dynamic batching increases batch size
+// RUN: env SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKDYN %s
+
+// level_zero_dynamic_batch_test.cpp
+//
+// This tests the Level Zero plugin's kernel dynamic batch size adjustment
+// code.
+// It starts out by enqueueing 40 kernels before it does a wait, and it does
+// this 5 times. That should cause the dynamic batch size adjustment to
+// raise the batch size several times.
+//
+// Then the test starts enqueueing only 4 kernels before doing a wait, and
+// it does that 5 times as well. That should cause the batch size to
+// be lowered, just once, to less than 4.
+//
+// CKDYN: Raising QueueBatchSize to 3
+// CKDYN: Raising QueueBatchSize to 4
+// CKDYN-NOT: Raising QueueBatchSize
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKDYN: Lowering QueueBatchSize to 3
+// CKDYN-NOT: Lowering QueueBatchSize
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+
#include "CL/sycl.hpp"
38+
#include <chrono>
39+
#include <cmath>
40+
#include <iostream>
41+
42+
namespace sycl = cl::sycl;
43+
44+
void validate(uint32_t *result, uint32_t *expect, size_t n) {
45+
int error = 0;
46+
for (int i = 0; i < n; i++) {
47+
if (result[i] != expect[i]) {
48+
error++;
49+
if (error < 10) {
50+
printf("Error: %d, expect: %d\n", result[i], expect[i]);
51+
}
52+
}
53+
}
54+
error > 0 ? printf("Error: %d\n", error) : printf("Test Pass\n");
55+
}
56+
57+
int main(int argc, char *argv[]) {
58+
size_t M = 65536;
59+
size_t N = 512 / 4;
60+
size_t AL = M * N * sizeof(uint32_t);
61+
62+
sycl::queue q(sycl::default_selector{});
63+
auto ctx = q.get_context();
64+
auto dev = q.get_device();
65+
66+
uint32_t *Y1 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
67+
uint32_t *Z1 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
68+
uint32_t *Z2 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
69+
uint32_t *Z3 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
70+
uint32_t *Z4 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
71+
uint32_t *Z5 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
72+
uint32_t *Z6 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
73+
uint32_t *Z7 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
74+
uint32_t *Z8 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
75+
76+
for (size_t i = 0; i < M * N; i++) {
77+
Y1[i] = i % 255;
78+
}
79+
80+
memset(Z1, '\0', AL);
81+
memset(Z2, '\0', AL);
82+
memset(Z3, '\0', AL);
83+
memset(Z4, '\0', AL);
84+
memset(Z5, '\0', AL);
85+
memset(Z6, '\0', AL);
86+
memset(Z7, '\0', AL);
87+
memset(Z8, '\0', AL);
88+
89+
for (size_t i = 0; i < 5; i++) {
90+
for (size_t j = 0; j < 5; j++) {
91+
q.submit([&](sycl::handler &h) {
92+
h.parallel_for<class u32_copy1>(sycl::range<2>{M, N},
93+
[=](sycl::id<2> it) {
94+
const int m = it[0];
95+
const int n = it[1];
96+
Z1[m * N + n] = Y1[m * N + n];
97+
});
98+
});
99+
q.submit([&](sycl::handler &h) {
100+
h.parallel_for<class u32_copy2>(sycl::range<2>{M, N},
101+
[=](sycl::id<2> it) {
102+
const int m = it[0];
103+
const int n = it[1];
104+
Z2[m * N + n] = Y1[m * N + n];
105+
});
106+
});
107+
q.submit([&](sycl::handler &h) {
108+
h.parallel_for<class u32_copy3>(sycl::range<2>{M, N},
109+
[=](sycl::id<2> it) {
110+
const int m = it[0];
111+
const int n = it[1];
112+
Z3[m * N + n] = Y1[m * N + n];
113+
});
114+
});
115+
q.submit([&](sycl::handler &h) {
116+
h.parallel_for<class u32_copy4>(sycl::range<2>{M, N},
117+
[=](sycl::id<2> it) {
118+
const int m = it[0];
119+
const int n = it[1];
120+
Z4[m * N + n] = Y1[m * N + n];
121+
});
122+
});
123+
q.submit([&](sycl::handler &h) {
124+
h.parallel_for<class u32_copy5>(sycl::range<2>{M, N},
125+
[=](sycl::id<2> it) {
126+
const int m = it[0];
127+
const int n = it[1];
128+
Z5[m * N + n] = Y1[m * N + n];
129+
});
130+
});
131+
q.submit([&](sycl::handler &h) {
132+
h.parallel_for<class u32_copy6>(sycl::range<2>{M, N},
133+
[=](sycl::id<2> it) {
134+
const int m = it[0];
135+
const int n = it[1];
136+
Z6[m * N + n] = Y1[m * N + n];
137+
});
138+
});
139+
q.submit([&](sycl::handler &h) {
140+
h.parallel_for<class u32_copy7>(sycl::range<2>{M, N},
141+
[=](sycl::id<2> it) {
142+
const int m = it[0];
143+
const int n = it[1];
144+
Z7[m * N + n] = Y1[m * N + n];
145+
});
146+
});
147+
q.submit([&](sycl::handler &h) {
148+
h.parallel_for<class u32_copy8>(sycl::range<2>{M, N},
149+
[=](sycl::id<2> it) {
150+
const int m = it[0];
151+
const int n = it[1];
152+
Z8[m * N + n] = Y1[m * N + n];
153+
});
154+
});
155+
}
156+
q.wait();
157+
}
158+
159+
validate(Y1, Z1, M * N);
160+
validate(Y1, Z2, M * N);
161+
validate(Y1, Z3, M * N);
162+
validate(Y1, Z4, M * N);
163+
validate(Y1, Z5, M * N);
164+
validate(Y1, Z6, M * N);
165+
validate(Y1, Z7, M * N);
166+
validate(Y1, Z8, M * N);
167+
168+
for (size_t i = 0; i < 5; i++) {
169+
q.submit([&](sycl::handler &h) {
170+
h.parallel_for<class u32_copy9>(sycl::range<2>{M, N},
171+
[=](sycl::id<2> it) {
172+
const int m = it[0];
173+
const int n = it[1];
174+
Z1[m * N + n] = Y1[m * N + n];
175+
});
176+
});
177+
q.submit([&](sycl::handler &h) {
178+
h.parallel_for<class u32_copy10>(sycl::range<2>{M, N},
179+
[=](sycl::id<2> it) {
180+
const int m = it[0];
181+
const int n = it[1];
182+
Z2[m * N + n] = Y1[m * N + n];
183+
});
184+
});
185+
q.submit([&](sycl::handler &h) {
186+
h.parallel_for<class u32_copy11>(sycl::range<2>{M, N},
187+
[=](sycl::id<2> it) {
188+
const int m = it[0];
189+
const int n = it[1];
190+
Z3[m * N + n] = Y1[m * N + n];
191+
});
192+
});
193+
q.submit([&](sycl::handler &h) {
194+
h.parallel_for<class u32_copy12>(sycl::range<2>{M, N},
195+
[=](sycl::id<2> it) {
196+
const int m = it[0];
197+
const int n = it[1];
198+
Z4[m * N + n] = Y1[m * N + n];
199+
});
200+
});
201+
q.wait();
202+
}
203+
validate(Y1, Z1, M * N);
204+
validate(Y1, Z2, M * N);
205+
validate(Y1, Z3, M * N);
206+
validate(Y1, Z4, M * N);
207+
208+
sycl::free(Y1, ctx);
209+
sycl::free(Z1, ctx);
210+
sycl::free(Z2, ctx);
211+
sycl::free(Z3, ctx);
212+
sycl::free(Z4, ctx);
213+
sycl::free(Z5, ctx);
214+
sycl::free(Z6, ctx);
215+
sycl::free(Z7, ctx);
216+
sycl::free(Z8, ctx);
217+
218+
return 0;
219+
}
