@@ -104,7 +104,6 @@ void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
104
104
pCmd4->setDestinationRegisterAddress (aluRegister);
105
105
106
106
// Add PIPE_CONTROL to flush caches
107
- typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL;
108
107
auto pCmd5 = reinterpret_cast <PIPE_CONTROL *>(pCommandStream->getSpace (sizeof (PIPE_CONTROL)));
109
108
*pCmd5 = PIPE_CONTROL::sInit ();
110
109
pCmd5->setCommandStreamerStallEnable (true );
@@ -157,7 +156,6 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
157
156
HwTimeStamps &hwTimeStamps,
158
157
OCLRT::LinearStream *commandStream) {
159
158
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
160
- using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
161
159
162
160
// PIPE_CONTROL for global timestamp
163
161
uint64_t TimeStampAddress = reinterpret_cast <uint64_t >(&(hwTimeStamps.GlobalStartTS ));
@@ -185,7 +183,6 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(
185
183
OCLRT::LinearStream *commandStream) {
186
184
187
185
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
188
- using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
189
186
190
187
// PIPE_CONTROL for global timestamp
191
188
auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace (sizeof (PIPE_CONTROL));
@@ -340,7 +337,6 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(
340
337
OCLRT::HwPerfCounter &hwPerfCounter,
341
338
OCLRT::LinearStream *commandStream) {
342
339
343
- using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
344
340
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
345
341
using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
346
342
@@ -387,7 +383,6 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(
387
383
OCLRT::HwPerfCounter &hwPerfCounter,
388
384
OCLRT::LinearStream *commandStream) {
389
385
390
- using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
391
386
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
392
387
using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
393
388
@@ -440,6 +435,7 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
440
435
KernelOperation **blockedCommandsData,
441
436
HwTimeStamps *hwTimeStamps,
442
437
OCLRT::HwPerfCounter *hwPerfCounter,
438
+ TimestampPacket *timestampPacket,
443
439
PreemptionMode preemptionMode,
444
440
bool blockQueue,
445
441
uint32_t commandType) {
@@ -519,6 +515,7 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
519
515
520
516
DEBUG_BREAK_IF (offsetInterfaceDescriptorTable % 64 != 0 );
521
517
518
+ size_t currentDispatchIndex = 0 ;
522
519
for (auto &dispatchInfo : multiDispatchInfo) {
523
520
auto &kernel = *dispatchInfo.getKernel ();
524
521
@@ -613,11 +610,20 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
613
610
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
614
611
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL (commandStream, kernel, true );
615
612
613
+ bool setupTimestampPacket = (DebugManager.flags .EnableTimestampPacket .get ()) && (currentDispatchIndex == numDispatches - 1 );
614
+ if (setupTimestampPacket) {
615
+ GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket (commandStream, nullptr , timestampPacket, TimestampPacket::WriteOperationType::Start);
616
+ }
617
+
616
618
// Program the walker. Invokes execution so all state should already be programmed
617
619
typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER;
618
620
auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace (sizeof (GPGPU_WALKER));
619
621
*pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
620
622
623
+ if (setupTimestampPacket) {
624
+ GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket (commandStream, pGpGpuWalkerCmd, timestampPacket, TimestampPacket::WriteOperationType::End);
625
+ }
626
+
621
627
size_t globalOffsets[3 ] = {offset.x , offset.y , offset.z };
622
628
size_t startWorkGroups[3 ] = {swgs.x , swgs.y , swgs.z };
623
629
size_t numWorkGroups[3 ] = {nwgs.x , nwgs.y , nwgs.z };
@@ -645,6 +651,7 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
645
651
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL (commandStream, kernel, false );
646
652
647
653
PreemptionHelper::applyPreemptionWaCmdsEnd<GfxFamily>(commandStream, commandQueue.getDevice ());
654
+ currentDispatchIndex++;
648
655
}
649
656
650
657
// If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
@@ -656,6 +663,24 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
656
663
}
657
664
}
658
665
666
+ template <typename GfxFamily>
667
+ void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
668
+ LinearStream *cmdStream,
669
+ WALKER_HANDLE walkerHandle,
670
+ TimestampPacket *timestampPacket,
671
+ TimestampPacket::WriteOperationType writeOperationType) {
672
+
673
+ uint64_t address = timestampPacket->pickAddressForPipeControlWrite (writeOperationType);
674
+
675
+ auto pipeControlCmd = cmdStream->getSpaceForCmd <PIPE_CONTROL>();
676
+ *pipeControlCmd = PIPE_CONTROL::sInit ();
677
+ pipeControlCmd->setCommandStreamerStallEnable (true );
678
+ pipeControlCmd->setPostSyncOperation (PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA);
679
+ pipeControlCmd->setAddress (static_cast <uint32_t >(address & 0x0000FFFFFFFFULL ));
680
+ pipeControlCmd->setAddressHigh (static_cast <uint32_t >(address >> 32 ));
681
+ pipeControlCmd->setImmediateData (0 );
682
+ }
683
+
659
684
template <typename GfxFamily>
660
685
void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
661
686
CommandQueue &commandQueue,
@@ -667,7 +692,6 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
667
692
668
693
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
669
694
using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
670
- using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
671
695
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
672
696
673
697
OCLRT::LinearStream *commandStream = nullptr ;
@@ -802,14 +826,14 @@ size_t GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(const K
802
826
template <typename GfxFamily>
803
827
size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
804
828
size_t size = KernelCommandsHelper<GfxFamily>::getSizeRequiredCS () +
805
- sizeof (typename GfxFamily:: PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired () ? 2 : 1 );
829
+ sizeof (PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired () ? 2 : 1 );
806
830
if (reserveProfilingCmdsSpace) {
807
- size += 2 * sizeof (typename GfxFamily:: PIPE_CONTROL) + 4 * sizeof (typename GfxFamily::MI_STORE_REGISTER_MEM);
831
+ size += 2 * sizeof (PIPE_CONTROL) + 4 * sizeof (typename GfxFamily::MI_STORE_REGISTER_MEM);
808
832
}
809
833
if (reservePerfCounters) {
810
834
// start cmds
811
835
// P_C: flush CS & TimeStamp BEGIN
812
- size += 2 * sizeof (typename GfxFamily:: PIPE_CONTROL);
836
+ size += 2 * sizeof (PIPE_CONTROL);
813
837
// SRM NOOPID & Frequency
814
838
size += 2 * sizeof (typename GfxFamily::MI_STORE_REGISTER_MEM);
815
839
// gp registers
@@ -821,7 +845,7 @@ size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(bool reserveProfiling
821
845
822
846
// end cmds
823
847
// P_C: flush CS & TimeStamp END;
824
- size += 2 * sizeof (typename GfxFamily:: PIPE_CONTROL);
848
+ size += 2 * sizeof (PIPE_CONTROL);
825
849
// OA buffer (status head, tail)
826
850
size += 3 * sizeof (typename GfxFamily::MI_STORE_REGISTER_MEM);
827
851
// report perf count
@@ -858,15 +882,15 @@ size_t EnqueueOperation<GfxFamily>::getSizeRequiredCS(uint32_t cmdType, bool res
858
882
template <typename GfxFamily>
859
883
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
860
884
size_t size = sizeof (typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS () +
861
- sizeof (typename GfxFamily:: PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired () ? 2 : 1 );
885
+ sizeof (PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired () ? 2 : 1 );
862
886
size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice ());
863
887
if (reserveProfilingCmdsSpace) {
864
- size += 2 * sizeof (typename GfxFamily:: PIPE_CONTROL) + 2 * sizeof (typename GfxFamily::MI_STORE_REGISTER_MEM);
888
+ size += 2 * sizeof (PIPE_CONTROL) + 2 * sizeof (typename GfxFamily::MI_STORE_REGISTER_MEM);
865
889
}
866
890
if (reservePerfCounters) {
867
891
// start cmds
868
892
// P_C: flush CS & TimeStamp BEGIN
869
- size += 2 * sizeof (typename GfxFamily:: PIPE_CONTROL);
893
+ size += 2 * sizeof (PIPE_CONTROL);
870
894
// SRM NOOPID & Frequency
871
895
size += 2 * sizeof (typename GfxFamily::MI_STORE_REGISTER_MEM);
872
896
// gp registers
@@ -878,7 +902,7 @@ size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilin
878
902
879
903
// end cmds
880
904
// P_C: flush CS & TimeStamp END;
881
- size += 2 * sizeof (typename GfxFamily:: PIPE_CONTROL);
905
+ size += 2 * sizeof (PIPE_CONTROL);
882
906
// OA buffer (status head, tail)
883
907
size += 3 * sizeof (typename GfxFamily::MI_STORE_REGISTER_MEM);
884
908
// report perf count
@@ -899,7 +923,7 @@ template <typename GfxFamily>
899
923
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSNonKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue) {
900
924
size_t size = 0 ;
901
925
if (reserveProfilingCmdsSpace) {
902
- size += 2 * sizeof (typename GfxFamily:: PIPE_CONTROL) + 4 * sizeof (typename GfxFamily::MI_STORE_REGISTER_MEM);
926
+ size += 2 * sizeof (PIPE_CONTROL) + 4 * sizeof (typename GfxFamily::MI_STORE_REGISTER_MEM);
903
927
}
904
928
return size;
905
929
}
0 commit comments