Skip to content

Commit 1c377dc

Browse files
fix: ensure payload arguments are patched before walker command is fetched
In case of an indirect kernel launch, some payload arguments are patched just before the walker command. This change disables prefetch, performs a batch buffer start to the next bytes, and then re-enables prefetch. All these operations are performed between MI_STORE_REGISTER_MEM and COMPUTE_WALKER. Related-To: NEO-14584 Signed-off-by: Mateusz Jablonski <[email protected]>
1 parent ca45573 commit 1c377dc

File tree

2 files changed

+66
-0
lines changed

2 files changed

+66
-0
lines changed

shared/source/command_container/command_encoder.inl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,13 @@ void EncodeIndirectParams<Family>::encode(CommandContainer &container, uint64_t
631631
setWorkDimIndirect(container, numWorkDimOffsetV1, implicitArgsGpuPtr, dispatchInterface->getGroupSize(), nullptr);
632632
}
633633
}
634+
if (outArgs && !outArgs->commandsToPatch.empty()) {
635+
auto &commandStream = *container.getCommandStream();
636+
EncodeMiArbCheck<Family>::program(commandStream, true);
637+
auto gpuVa = commandStream.getCurrentGpuAddressPosition() + EncodeBatchBufferStartOrEnd<Family>::getBatchBufferStartSize();
638+
EncodeBatchBufferStartOrEnd<Family>::programBatchBufferStart(&commandStream, gpuVa, !(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()), false, false);
639+
EncodeMiArbCheck<Family>::program(commandStream, false);
640+
}
634641
}
635642

636643
template <typename Family>

shared/test/unit_test/encoders/test_encode_math.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "shared/source/indirect_heap/heap_size.h"
1111
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
1212
#include "shared/test/common/fixtures/device_fixture.h"
13+
#include "shared/test/common/helpers/gtest_helpers.h"
1314
#include "shared/test/common/mocks/mock_device.h"
1415
#include "shared/test/common/test_macros/hw_test.h"
1516
#include "shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.h"
@@ -489,6 +490,64 @@ HWTEST_F(CommandEncoderMathTest, givenPayloadArgumentStoredInInlineDataWhenEncod
489490
}
490491
}
491492

493+
// Encodes indirect params twice with the same dispatch interface: once with
// every programmed dispatch-trait offset at or above inlineDataPayloadSize,
// and once with workDim moved below it. The second stream is expected to be
// longer by exactly two MI_ARB_CHECKs and one MI_BATCH_BUFFER_START, laid out
// as MI_ARB_CHECK -> MI_BATCH_BUFFER_START -> MI_ARB_CHECK.
HWTEST_F(CommandEncoderMathTest, givenPayloadArgumentStoredInInlineDataWhenEncodeIndirectParamsThenPreparserMitigationIsProgrammed) {
    using MI_ARB_CHECK = typename FamilyType::MI_ARB_CHECK;
    using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;

    // Reference stream (no argument below inlineDataPayloadSize).
    CommandContainer baselineContainer;
    baselineContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false);

    // Stream encoded with workDim below inlineDataPayloadSize.
    CommandContainer mitigatedContainer;
    mitigatedContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false);

    const uint64_t crossThreadGpuVa = 0xBADF000;

    IndirectParamsInInlineDataArgs inlineDataArgs{};

    MockDispatchKernelEncoder kernelEncoder;

    auto &descriptor = kernelEncoder.kernelDescriptor;
    uint32_t localSizes[3] = {1, 2, 3};
    kernelEncoder.getGroupSizeResult = localSizes;

    descriptor.kernelAttributes.inlineDataPayloadSize = 0x100;

    auto &traits = descriptor.payloadMappings.dispatchTraits;
    traits.numWorkGroups[0] = 0x100;
    traits.numWorkGroups[1] = 0x110;
    traits.numWorkGroups[2] = undefined<CrossThreadDataOffset>;

    traits.globalWorkSize[0] = undefined<CrossThreadDataOffset>;
    traits.globalWorkSize[1] = 0x120;
    traits.globalWorkSize[2] = 0x130;

    // First pass: workDim (0x140) is at or above inlineDataPayloadSize (0x100).
    traits.workDim = 0x140;
    EncodeIndirectParams<FamilyType>::encode(baselineContainer, crossThreadGpuVa, &kernelEncoder, 0u, &inlineDataArgs);

    // Second pass: workDim (0x60) is below inlineDataPayloadSize (0x100);
    // presumably this is what makes the encoder record a command to patch.
    traits.workDim = 0x60;
    EncodeIndirectParams<FamilyType>::encode(mitigatedContainer, crossThreadGpuVa, &kernelEncoder, 0u, &inlineDataArgs);

    const auto baselineUsed = baselineContainer.getCommandStream()->getUsed();
    const auto mitigatedUsed = mitigatedContainer.getCommandStream()->getUsed();

    const auto expectedDiff = sizeof(MI_ARB_CHECK) * 2 + sizeof(MI_BATCH_BUFFER_START);
    EXPECT_EQ(expectedDiff, mitigatedUsed - baselineUsed);

    // Parse only the bytes of the second stream that lie past the baseline length.
    GenCmdList commands;
    CmdParse<FamilyType>::parseCommandBuffer(commands,
                                             ptrOffset(mitigatedContainer.getCommandStream()->getCpuBase(), baselineUsed),
                                             mitigatedUsed - baselineUsed);

    auto cmdIt = find<MI_ARB_CHECK *>(commands.begin(), commands.end());
    ASSERT_NE(cmdIt, commands.end());

    ++cmdIt;
    cmdIt = find<MI_BATCH_BUFFER_START *>(cmdIt, commands.end());
    ASSERT_NE(cmdIt, commands.end());

    ++cmdIt;
    cmdIt = find<MI_ARB_CHECK *>(cmdIt, commands.end());
    ASSERT_NE(cmdIt, commands.end());

    // No third MI_ARB_CHECK may follow.
    ++cmdIt;
    cmdIt = find<MI_ARB_CHECK *>(cmdIt, commands.end());
    EXPECT_EQ(cmdIt, commands.end());
}
550+
492551
using CommandEncodeAluTests = ::testing::Test;
493552

494553
HWTEST_F(CommandEncodeAluTests, whenAskingForIncrementOrDecrementCmdsSizeThenReturnCorrectValue) {

0 commit comments

Comments
 (0)