Skip to content

Commit cffe7f1

Browse files
Use postsync for copy and fill
Related-To: NEO-5968
Signed-off-by: Aravind Gopalakrishnan <[email protected]>
1 parent f1574be commit cffe7f1

File tree

9 files changed

+411
-31
lines changed

9 files changed

+411
-31
lines changed

level_zero/core/source/cmdlist/cmdlist_hw.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,11 @@ struct CommandListCoreFamily : CommandListImp {
235235
void appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker);
236236
void appendEventForProfilingCopyCommand(ze_event_handle_t hEvent, bool beforeWalker);
237237
void appendSignalEventPostWalker(ze_event_handle_t hEvent, bool workloadPartition);
238+
void programEventL3Flush(ze_event_handle_t hEvent,
239+
Device *device,
240+
uint32_t partitionCount,
241+
NEO::CommandContainer &commandContainer);
242+
void adjustEventKernelCount(ze_event_handle_t hEvent);
238243
void programStateBaseAddress(NEO::CommandContainer &container, bool genericMediaStateClearRequired);
239244
void appendComputeBarrierCommand();
240245
NEO::PipeControlArgs createBarrierFlags();

level_zero/core/source/cmdlist/cmdlist_hw.inl

Lines changed: 36 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -224,8 +224,17 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchCooperativeKernel(
224224
return ret;
225225
}
226226

227-
return appendLaunchKernelWithParams(hKernel, pLaunchFuncArgs,
228-
hSignalEvent, false, false, true);
227+
ret = appendLaunchKernelWithParams(hKernel, pLaunchFuncArgs,
228+
hSignalEvent, false, false, true);
229+
if (ret) {
230+
return ret;
231+
}
232+
233+
if (hSignalEvent) {
234+
programEventL3Flush(hSignalEvent, this->device, this->partitionCount, commandContainer);
235+
}
236+
237+
return ret;
229238
}
230239

231240
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -242,6 +251,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelIndirect(ze_
242251
appendEventForProfiling(hEvent, true, false);
243252
ret = appendLaunchKernelWithParams(hKernel, pDispatchArgumentsBuffer,
244253
nullptr, true, false, false);
254+
if (ret) {
255+
return ret;
256+
}
257+
if (hEvent) {
258+
programEventL3Flush(hEvent, this->device, this->partitionCount, commandContainer);
259+
}
245260
appendSignalEventPostWalker(hEvent, false);
246261

247262
return ret;
@@ -276,7 +291,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchMultipleKernelsInd
276291
return ret;
277292
}
278293
}
279-
294+
if (hEvent) {
295+
programEventL3Flush(hEvent, this->device, this->partitionCount, commandContainer);
296+
}
280297
appendSignalEventPostWalker(hEvent, false);
281298

282299
return ret;
@@ -800,22 +817,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemAdvise(ze_device_hand
800817
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
801818
}
802819

803-
template <GFXCORE_FAMILY gfxCoreFamily>
804-
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(ze_kernel_handle_t hKernel,
805-
const ze_group_count_t *pThreadGroupDimensions,
806-
ze_event_handle_t hEvent) {
807-
return appendLaunchKernelWithParams(hKernel, pThreadGroupDimensions, nullptr, false, false, false);
808-
}
809-
810-
template <GFXCORE_FAMILY gfxCoreFamily>
811-
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker) {
812-
if (beforeWalker) {
813-
appendEventForProfiling(hEvent, true, false);
814-
} else {
815-
appendSignalEventPostWalker(hEvent, false);
816-
}
817-
}
818-
819820
template <GFXCORE_FAMILY gfxCoreFamily>
820821
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyKernelWithGA(void *dstPtr,
821822
NEO::GraphicsAllocation *dstPtrAlloc,
@@ -1069,6 +1070,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
10691070
}
10701071

10711072
appendEventForProfilingAllWalkers(hSignalEvent, true);
1073+
adjustEventKernelCount(hSignalEvent);
10721074

10731075
if (ret == ZE_RESULT_SUCCESS && leftSize) {
10741076
Builtin func = Builtin::CopyBufferToBufferSide;
@@ -1128,16 +1130,22 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
11281130
isStateless);
11291131
}
11301132

1133+
if (hSignalEvent) {
1134+
programEventL3Flush(hSignalEvent, this->device, this->partitionCount, commandContainer);
1135+
}
11311136
appendEventForProfilingAllWalkers(hSignalEvent, false);
11321137

11331138
const auto &hwInfo = this->device->getHwInfo();
11341139
if (NEO::MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, hwInfo)) {
11351140
auto event = Event::fromHandle(hSignalEvent);
11361141
if (event) {
11371142
dstAllocationStruct.needsFlush &= !event->signalScope;
1143+
dstAllocationStruct.needsFlush &= !event->l3FlushWaApplied;
11381144
}
11391145

1140-
if (dstAllocationStruct.needsFlush && !isCopyOnly()) {
1146+
dstAllocationStruct.needsFlush &= !isCopyOnly();
1147+
1148+
if (dstAllocationStruct.needsFlush) {
11411149
NEO::PipeControlArgs args;
11421150
args.dcFlushEnable = true;
11431151
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
@@ -1452,6 +1460,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
14521460
builtinFunction->setArgumentValue(2, sizeof(value), &value);
14531461

14541462
appendEventForProfilingAllWalkers(hSignalEvent, true);
1463+
adjustEventKernelCount(hSignalEvent);
14551464

14561465
uint32_t groups = static_cast<uint32_t>(size) / groupSizeX;
14571466
ze_group_count_t dispatchFuncArgs{groups, 1u, 1u};
@@ -1526,6 +1535,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
15261535
builtinFunction->setArgumentValue(3, sizeof(patternSizeInEls), &patternSizeInEls);
15271536

15281537
appendEventForProfilingAllWalkers(hSignalEvent, true);
1538+
adjustEventKernelCount(hSignalEvent);
15291539

15301540
ze_group_count_t dispatchFuncArgs{groups, 1u, 1u};
15311541
res = appendLaunchKernelSplit(builtinFunction->toHandle(), &dispatchFuncArgs, hSignalEvent);
@@ -1564,15 +1574,21 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
15641574
}
15651575
}
15661576

1577+
if (hSignalEvent) {
1578+
programEventL3Flush(hSignalEvent, this->device, this->partitionCount, commandContainer);
1579+
}
15671580
appendEventForProfilingAllWalkers(hSignalEvent, false);
15681581

15691582
const auto &hwInfo = this->device->getHwInfo();
15701583
if (NEO::MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, hwInfo)) {
15711584
auto event = Event::fromHandle(hSignalEvent);
15721585
if (event) {
15731586
hostPointerNeedsFlush &= !event->signalScope;
1587+
hostPointerNeedsFlush &= !event->l3FlushWaApplied;
15741588
}
15751589

1590+
hostPointerNeedsFlush &= !isCopyOnly();
1591+
15761592
if (hostPointerNeedsFlush) {
15771593
NEO::PipeControlArgs args;
15781594
args.dcFlushEnable = true;

level_zero/core/source/cmdlist/cmdlist_hw_base.inl

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,26 @@ size_t CommandListCoreFamily<gfxCoreFamily>::getReserveSshSize() {
3232
return helper.getRenderSurfaceStateSize();
3333
}
3434

35+
// Base-family definition of programEventL3Flush.
// The body is intentionally empty here: no commands are added to
// commandContainer and none of the parameters are read. The same commit shows
// a non-empty definition of this member in cmdlist_hw_xehp_and_later.inl.
// Callers in cmdlist_hw.inl invoke this after kernel dispatch whenever a
// signal event handle is present, so this definition makes those calls no-ops.
template <GFXCORE_FAMILY gfxCoreFamily>
36+
void CommandListCoreFamily<gfxCoreFamily>::programEventL3Flush(ze_event_handle_t hEvent,
37+
Device *device,
38+
uint32_t partitionCount,
39+
NEO::CommandContainer &commandContainer) {
40+
}
41+
42+
// Base-family definition of adjustEventKernelCount.
// Empty on purpose: hEvent is not inspected or modified here. The
// cmdlist_hw_xehp_and_later.inl definition in the same commit is the one that
// touches the event's kernelCount.
template <GFXCORE_FAMILY gfxCoreFamily>
43+
void CommandListCoreFamily<gfxCoreFamily>::adjustEventKernelCount(ze_event_handle_t hEvent) {
44+
}
45+
46+
// Base-family definition of appendEventForProfilingAllWalkers.
// Dispatches on beforeWalker:
//   - true:  forwards to appendEventForProfiling(hEvent, true, false)
//   - false: forwards to appendSignalEventPostWalker(hEvent, false)
// hEvent is passed through unchecked; any null handling is left to the two
// callees (not visible in this view).
template <GFXCORE_FAMILY gfxCoreFamily>
47+
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker) {
48+
if (beforeWalker) {
49+
appendEventForProfiling(hEvent, true, false);
50+
} else {
51+
appendSignalEventPostWalker(hEvent, false);
52+
}
53+
}
54+
3555
template <GFXCORE_FAMILY gfxCoreFamily>
3656
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(ze_kernel_handle_t hKernel,
3757
const ze_group_count_t *pThreadGroupDimensions,
@@ -174,6 +194,17 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
174194
return ZE_RESULT_SUCCESS;
175195
}
176196

197+
// Base-family definition of appendLaunchKernelSplit.
// When hEvent is non-null, the event's kernelCount is set to exactly 1
// (assigned, not incremented). The launch itself is then delegated to
// appendLaunchKernelWithParams with nullptr in the event position, so the
// event handle is NOT forwarded to the kernel launch from here.
// Returns whatever appendLaunchKernelWithParams returns.
template <GFXCORE_FAMILY gfxCoreFamily>
198+
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(ze_kernel_handle_t hKernel,
199+
const ze_group_count_t *pThreadGroupDimensions,
200+
ze_event_handle_t hEvent) {
201+
if (hEvent) {
202+
auto event = Event::fromHandle(hEvent);
203+
event->kernelCount = 1;
204+
}
205+
return appendLaunchKernelWithParams(hKernel, pThreadGroupDimensions, nullptr, false, false, false);
206+
}
207+
177208
template <GFXCORE_FAMILY gfxCoreFamily>
178209
void CommandListCoreFamily<gfxCoreFamily>::appendMultiPartitionPrologue(uint32_t partitionDataSize) {}
179210

level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -84,14 +84,20 @@ void CommandListCoreFamily<gfxCoreFamily>::applyMemoryRangesBarrier(uint32_t num
8484
}
8585

8686
template <GFXCORE_FAMILY gfxCoreFamily>
87-
void programEventL3Flush(ze_event_handle_t hEvent,
88-
Device *device,
89-
uint32_t partitionCount,
90-
NEO::CommandContainer &commandContainer) {
87+
void CommandListCoreFamily<gfxCoreFamily>::programEventL3Flush(ze_event_handle_t hEvent,
88+
Device *device,
89+
uint32_t partitionCount,
90+
NEO::CommandContainer &commandContainer) {
9191
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
9292
using POST_SYNC_OPERATION = typename GfxFamily::PIPE_CONTROL::POST_SYNC_OPERATION;
9393
auto event = Event::fromHandle(hEvent);
9494

95+
const auto &hwInfo = this->device->getHwInfo();
96+
bool L3FlushEnable = NEO::MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(event->signalScope, hwInfo);
97+
if (!L3FlushEnable || isCopyOnly()) {
98+
return;
99+
}
100+
95101
auto eventPartitionOffset = (partitionCount > 1) ? (partitionCount * event->getSinglePacketSize())
96102
: event->getSinglePacketSize();
97103
uint64_t eventAddress = event->getPacketAddress(device) + eventPartitionOffset;
@@ -121,6 +127,13 @@ void programEventL3Flush(ze_event_handle_t hEvent,
121127
args);
122128
}
123129

130+
template <GFXCORE_FAMILY gfxCoreFamily>
131+
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker) {
132+
if (hEvent && isCopyOnly()) {
133+
appendSignalEventPostWalker(hEvent, false);
134+
}
135+
}
136+
124137
template <GFXCORE_FAMILY gfxCoreFamily>
125138
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(ze_kernel_handle_t hKernel,
126139
const ze_group_count_t *pThreadGroupDimensions,
@@ -165,6 +178,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
165178
commandContainer.addToResidencyContainer(eventAlloc);
166179
L3FlushEnable = NEO::MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(event->signalScope, hwInfo);
167180
isTimestampEvent = event->isUsingContextEndOffset();
181+
168182
eventAddress = event->getPacketAddress(this->device);
169183
}
170184

@@ -238,9 +252,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
238252
if (partitionCount > 1) {
239253
event->setPacketsInUse(partitionCount);
240254
}
241-
if (L3FlushEnable) {
242-
programEventL3Flush<gfxCoreFamily>(hEvent, this->device, partitionCount, commandContainer);
243-
}
255+
programEventL3Flush(hEvent, this->device, this->partitionCount, commandContainer);
244256
}
245257

246258
if (neoDevice->getDebugger()) {
@@ -292,6 +304,27 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
292304
return ZE_RESULT_SUCCESS;
293305
}
294306

307+
// XeHP-and-later definition of appendLaunchKernelSplit.
// When hEvent is non-null, the event's kernelCount is incremented by one for
// this launch (callers in cmdlist_hw.inl call adjustEventKernelCount before
// the first split launch). Unlike the base-family definition, hEvent IS
// forwarded to appendLaunchKernelWithParams.
// Returns whatever appendLaunchKernelWithParams returns.
template <GFXCORE_FAMILY gfxCoreFamily>
308+
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(ze_kernel_handle_t hKernel,
309+
const ze_group_count_t *pThreadGroupDimensions,
310+
ze_event_handle_t hEvent) {
311+
if (hEvent) {
312+
auto event = Event::fromHandle(hEvent);
313+
event->kernelCount += 1;
314+
}
315+
return appendLaunchKernelWithParams(hKernel, pThreadGroupDimensions, hEvent, false, false, false);
316+
}
317+
318+
// XeHP-and-later definition of adjustEventKernelCount.
// For a non-null hEvent on a command list that is not copy-only, resets the
// event's kernelCount to 0. Copy-only lists and null handles are left
// untouched.
// NOTE(review): presumably the reset lets the per-launch "kernelCount += 1"
// in appendLaunchKernelSplit count from zero for each copy/fill operation -
// confirm against the Event implementation.
template <GFXCORE_FAMILY gfxCoreFamily>
319+
void CommandListCoreFamily<gfxCoreFamily>::adjustEventKernelCount(ze_event_handle_t hEvent) {
320+
if (hEvent) {
321+
auto event = Event::fromHandle(hEvent);
322+
if (!isCopyOnly()) {
323+
event->kernelCount = 0u;
324+
}
325+
}
326+
}
327+
295328
template <GFXCORE_FAMILY gfxCoreFamily>
296329
void CommandListCoreFamily<gfxCoreFamily>::appendMultiPartitionPrologue(uint32_t partitionDataSize) {
297330
NEO::ImplicitScalingDispatch<GfxFamily>::dispatchOffsetRegister(*commandContainer.getCommandStream(),

level_zero/core/source/event/event.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,8 @@ ze_result_t EventPoolImp::initialize(DriverHandle *driver, Context *context, uin
8080
eventSize = static_cast<uint32_t>(alignUp(EventPacketsCount::eventPackets * hwHelper.getSingleTimestampPacketSize(), eventAlignment));
8181

8282
size_t alignedSize = alignUp<size_t>(numEvents * eventSize, MemoryConstants::pageSize64k);
83-
NEO::AllocationType allocationType = isEventPoolTimestampFlagSet() ? NEO::AllocationType::TIMESTAMP_PACKET_TAG_BUFFER
84-
: NEO::AllocationType::BUFFER_HOST_MEMORY;
83+
NEO::AllocationType allocationType = NEO::AllocationType::TIMESTAMP_PACKET_TAG_BUFFER;
84+
8585
if (this->devices.size() > 1) {
8686
useDeviceAlloc = false;
8787
}

level_zero/core/source/event/event_impl.inl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,8 @@ uint32_t EventImp<TagSizeT>::getPacketsUsedInLastKernel() {
393393

394394
template <typename TagSizeT>
395395
void EventImp<TagSizeT>::setPacketsInUse(uint32_t value) {
396-
kernelEventCompletionData[getCurrKernelDataIndex()].setPacketsUsed(value);
396+
auto kernelIndex = getCurrKernelDataIndex();
397+
kernelEventCompletionData[kernelIndex].setPacketsUsed(value);
397398
}
398399

399400
template <typename TagSizeT>

0 commit comments

Comments
 (0)