
Commit c36c083 (parent: 79c8605)

Refactor implicit scaling parameters for surface state

Related-To: NEO-6589
Signed-off-by: Zbigniew Zdanowicz <[email protected]>

9 files changed: +105 -18 lines

level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl

Lines changed: 1 addition & 0 deletions
@@ -266,6 +266,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
     args.gmmHelper = neoDevice->getGmmHelper();
     args.useGlobalAtomics = kernelImp->getKernelDescriptor().kernelAttributes.flags.useGlobalAtomics;
     args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1;
+    args.implicitScaling = this->partitionCount > 1;
 
     NEO::EncodeSurfaceState<GfxFamily>::encodeBuffer(args);
     *reinterpret_cast<typename GfxFamily::RENDER_SURFACE_STATE *>(surfaceStateSpace) = surfaceState;
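The command list already knows how many tile partitions the walker will be split into, so it sets the new flag directly from its own partition count instead of letting the encoder infer it from the sub-device count. Below is a minimal standalone sketch of that caller-side derivation; the types are hypothetical stand-ins, not the real NEO::EncodeSurfaceStateArgs.

// Hypothetical stand-in types: a minimal model of how a caller that already
// tracks its walker partition count fills the new flag.
#include <cstdint>

struct SurfaceStateArgsSketch {              // loose stand-in for NEO::EncodeSurfaceStateArgs
    uint32_t numAvailableDevices = 1;
    bool areMultipleSubDevicesInContext = false;
    bool implicitScaling = false;            // field introduced by this commit
};

SurfaceStateArgsSketch makeArgsForLaunch(uint32_t numSubDevices, uint32_t partitionCount) {
    SurfaceStateArgsSketch args{};
    args.numAvailableDevices = numSubDevices;
    args.areMultipleSubDevicesInContext = numSubDevices > 1;
    args.implicitScaling = partitionCount > 1;   // a launch split across tiles implies implicit scaling
    return args;
}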

level_zero/core/source/kernel/kernel_hw.h

Lines changed: 7 additions & 6 deletions
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2021 Intel Corporation
+ * Copyright (C) 2020-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,12 +63,14 @@ struct KernelHw : public KernelImp {
         bufferSizeForSsh = alignUp(bufferSizeForSsh, alignment);
 
         bool l3Enabled = true;
-
         // Allocation MUST be cacheline (64 byte) aligned in order to enable L3 caching otherwise Heap corruption will occur coming from the KMD.
         // Most commonly this issue will occur with Host Point Allocations from customers.
         l3Enabled = isL3Capable(*alloc);
 
-        auto allocData = this->module->getDevice()->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast<void *>(alloc->getGpuAddress()));
+        Device *device = module->getDevice();
+        NEO::Device *neoDevice = device->getNEODevice();
+
+        auto allocData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast<void *>(alloc->getGpuAddress()));
         if (allocData && allocData->allocationFlagsProperty.flags.locallyUncachedResource) {
             l3Enabled = false;
         }
@@ -77,18 +79,17 @@
             this->kernelRequiresQueueUncachedMocsCount++;
         }
 
-        NEO::Device *neoDevice = module->getDevice()->getNEODevice();
-
         NEO::EncodeSurfaceStateArgs args;
         args.outMemory = &surfaceState;
         args.graphicsAddress = bufferAddressForSsh;
         args.size = bufferSizeForSsh;
-        args.mocs = this->module->getDevice()->getMOCS(l3Enabled, false);
+        args.mocs = device->getMOCS(l3Enabled, false);
         args.numAvailableDevices = neoDevice->getNumGenericSubDevices();
         args.allocation = alloc;
         args.gmmHelper = neoDevice->getGmmHelper();
         args.useGlobalAtomics = kernelImmData->getDescriptor().kernelAttributes.flags.useGlobalAtomics;
         args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1;
+        args.implicitScaling = device->isImplicitScalingCapable();
 
         NEO::EncodeSurfaceState<GfxFamily>::encodeBuffer(args);
         *reinterpret_cast<typename GfxFamily::RENDER_SURFACE_STATE *>(surfaceStateAddress) = surfaceState;
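On the kernel path the flag comes from a device-level capability query (device->isImplicitScalingCapable()) and is set alongside the existing L3/MOCS decision. A simplified standalone model of that ordering follows; the names and MOCS values are illustrative assumptions, not the Level Zero KernelHw code.

// Standalone model of the ordering above: decide L3 eligibility first, then
// pick MOCS from the device, then take the implicit-scaling flag from a
// device-level capability rather than from a particular dispatch.
#include <cstdint>

struct DeviceSketch {
    bool implicitScalingCapable = false;
    uint32_t getMOCS(bool l3Enabled) const { return l3Enabled ? 2u : 0u; }   // illustrative indices only
};

struct BufferSurfaceChoice {
    uint32_t mocs = 0;
    bool implicitScaling = false;
};

BufferSurfaceChoice chooseBufferSurfaceState(const DeviceSketch &device,
                                             bool allocationIsL3Capable,
                                             bool locallyUncachedResource) {
    bool l3Enabled = allocationIsL3Capable;
    if (locallyUncachedResource) {
        l3Enabled = false;                       // locally uncached SVM allocations must bypass L3
    }
    return {device.getMOCS(l3Enabled), device.implicitScalingCapable};
}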

level_zero/core/source/kernel/kernel_imp.cpp

Lines changed: 7 additions & 5 deletions
@@ -84,7 +84,8 @@ KernelImmutableData::~KernelImmutableData() {
 
 inline void patchWithImplicitSurface(ArrayRef<uint8_t> crossThreadData, ArrayRef<uint8_t> surfaceStateHeap,
                                      uintptr_t ptrToPatchInCrossThreadData, NEO::GraphicsAllocation &allocation,
-                                     const NEO::ArgDescPointer &ptr, const NEO::Device &device, bool useGlobalAtomics) {
+                                     const NEO::ArgDescPointer &ptr, const NEO::Device &device, bool useGlobalAtomics,
+                                     bool implicitScaling) {
     if (false == crossThreadData.empty()) {
         NEO::patchPointer(crossThreadData, ptr, ptrToPatchInCrossThreadData);
     }
@@ -107,6 +108,7 @@ inline void patchWithImplicitSurface(ArrayRef<uint8_t> crossThreadData, ArrayRef
     args.numAvailableDevices = device.getNumGenericSubDevices();
     args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1;
     args.mocs = hwHelper.getMocsIndex(*args.gmmHelper, true, false) << 1;
+    args.implicitScaling = implicitScaling;
 
     hwHelper.encodeBufferSurfaceState(args);
 }
@@ -179,7 +181,7 @@ void KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, Device *device
         patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
                                  static_cast<uintptr_t>(globalConstBuffer->getGpuAddressToPatch()),
                                  *globalConstBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalConstantsSurfaceAddress,
-                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics);
+                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics, deviceImp->isImplicitScalingCapable());
         this->residencyContainer.push_back(globalConstBuffer);
     } else if (nullptr != globalConstBuffer) {
         this->residencyContainer.push_back(globalConstBuffer);
@@ -191,7 +193,7 @@ void KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, Device *device
         patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
                                  static_cast<uintptr_t>(globalVarBuffer->getGpuAddressToPatch()),
                                  *globalVarBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalVariablesSurfaceAddress,
-                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics);
+                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics, deviceImp->isImplicitScalingCapable());
         this->residencyContainer.push_back(globalVarBuffer);
     } else if (nullptr != globalVarBuffer) {
         this->residencyContainer.push_back(globalVarBuffer);
@@ -758,7 +760,7 @@ void KernelImp::patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocatio
     patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
                              static_cast<uintptr_t>(privateAllocation->getGpuAddressToPatch()),
                              *privateAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress,
-                             *device->getNEODevice(), kernelAttributes.flags.useGlobalAtomics);
+                             *device->getNEODevice(), kernelAttributes.flags.useGlobalAtomics, device->isImplicitScalingCapable());
 }
 
 ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
@@ -941,7 +943,7 @@ void KernelImp::setDebugSurface() {
         patchWithImplicitSurface(ArrayRef<uint8_t>(), surfaceStateHeapRef,
                                  0,
                                  *device->getDebugSurface(), this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.systemThreadSurfaceAddress,
-                                 *device->getNEODevice(), getKernelDescriptor().kernelAttributes.flags.useGlobalAtomics);
+                                 *device->getNEODevice(), getKernelDescriptor().kernelAttributes.flags.useGlobalAtomics, device->isImplicitScalingCapable());
     }
 }
 void *KernelImp::patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless) {
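With the extra implicitScaling parameter, all four call sites of patchWithImplicitSurface (global constants buffer, global variables buffer, private memory, and the debug surface) pass the device's isImplicitScalingCapable() result, so the helper itself no longer needs to reason about device partitioning.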

level_zero/core/test/unit_tests/fixtures/module_fixture.h

Lines changed: 23 additions & 1 deletion
@@ -1,12 +1,13 @@
 /*
- * Copyright (C) 2020-2021 Intel Corporation
+ * Copyright (C) 2020-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
  */
 
 #pragma once
 
+#include "shared/source/command_container/implicit_scaling.h"
 #include "shared/source/helpers/file_io.h"
 #include "shared/source/memory_manager/allocation_properties.h"
 #include "shared/source/program/kernel_info.h"
@@ -392,5 +393,26 @@ struct ImportHostPointerModuleFixture : public ModuleFixture {
     void *hostPointer = nullptr;
 };
 
+struct MultiTileModuleFixture : public MultiDeviceModuleFixture {
+    void SetUp() {
+        DebugManager.flags.EnableImplicitScaling.set(1);
+        MultiDeviceFixture::numRootDevices = 1u;
+        MultiDeviceFixture::numSubDevices = 2u;
+
+        MultiDeviceModuleFixture::SetUp();
+        createModuleFromBinary(0);
+
+        device = driverHandle->devices[0];
+    }
+
+    void TearDown() {
+        MultiDeviceModuleFixture::TearDown();
+    }
+
+    DebugManagerStateRestore debugRestore;
+    VariableBackup<bool> backup{&NEO::ImplicitScaling::apiSupport, true};
+    L0::Device *device = nullptr;
+};
+
 } // namespace ult
 } // namespace L0
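The fixture forces NEO::ImplicitScaling::apiSupport to true for the duration of each test and relies on VariableBackup to restore it afterwards. The sketch below only models that scoped-restore idiom with a hypothetical ScopedValueBackup; NEO's real VariableBackup is richer.

// Minimal sketch of the scoped-restore idiom the fixture relies on.
template <typename T>
class ScopedValueBackup {
  public:
    ScopedValueBackup(T *target, T newValue) : target(target), previous(*target) {
        *target = newValue;
    }
    ~ScopedValueBackup() { *target = previous; }   // restore on scope exit, e.g. at test teardown

  private:
    T *target;
    T previous;
};

namespace sketch {
bool apiSupport = false;                           // stand-in for NEO::ImplicitScaling::apiSupport
}

void runMultiTileScenario() {
    ScopedValueBackup<bool> backup(&sketch::apiSupport, true);  // force implicit scaling support on
    // ... exercise the multi-tile path while apiSupport == true ...
}                                                  // original value restored here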

level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp

Lines changed: 29 additions & 0 deletions
@@ -2328,5 +2328,34 @@ TEST_F(KernelImplicitArgTests, givenKernelWithoutImplicitArgsWhenPatchingImplici
     EXPECT_EQ(0, memcmp(data, initData, 64));
 }
 
+using MultiTileModuleTest = Test<MultiTileModuleFixture>;
+
+HWTEST2_F(MultiTileModuleTest, GivenMultiTileDeviceWhenSettingKernelArgAndSurfaceStateThenMultiTileFlagsAreSetCorrectly, IsXEHP) {
+    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
+    ze_kernel_desc_t desc = {};
+    desc.pKernelName = kernelName.c_str();
+
+    WhiteBoxKernelHw<gfxCoreFamily> mockKernel;
+    mockKernel.module = modules[0].get();
+    mockKernel.initialize(&desc);
+
+    auto &arg = const_cast<NEO::ArgDescPointer &>(mockKernel.kernelImmData->getDescriptor().payloadMappings.explicitArgs[0].template as<NEO::ArgDescPointer>());
+    arg.bindless = undefined<CrossThreadDataOffset>;
+    arg.bindful = 0x40;
+
+    constexpr size_t size = 128;
+    uint64_t gpuAddress = 0x2000;
+    char bufferArray[size] = {};
+    void *buffer = reinterpret_cast<void *>(bufferArray);
+    NEO::MockGraphicsAllocation mockAllocation(buffer, gpuAddress, size);
+
+    mockKernel.setBufferSurfaceState(0, buffer, &mockAllocation);
+
+    void *surfaceStateAddress = ptrOffset(mockKernel.surfaceStateHeapData.get(), arg.bindful);
+    RENDER_SURFACE_STATE *surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(surfaceStateAddress);
+    EXPECT_FALSE(surfaceState->getDisableSupportForMultiGpuAtomics());
+    EXPECT_FALSE(surfaceState->getDisableSupportForMultiGpuPartialWrites());
+}
+
 } // namespace ult
 } // namespace L0

opencl/source/mem_obj/buffer_base.inl

Lines changed: 3 additions & 1 deletion
@@ -1,11 +1,12 @@
 /*
- * Copyright (C) 2019-2021 Intel Corporation
+ * Copyright (C) 2019-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
  */
 
 #include "shared/source/command_container/command_encoder.h"
+#include "shared/source/command_container/implicit_scaling.h"
 #include "shared/source/device/device.h"
 #include "shared/source/execution_environment/execution_environment.h"
 #include "shared/source/execution_environment/root_device_environment.h"
@@ -52,6 +53,7 @@ void BufferHw<GfxFamily>::setArgStateful(void *memory, bool forceNonAuxMode, boo
     args.gmmHelper = device.getGmmHelper();
     args.useGlobalAtomics = useGlobalAtomics;
    args.areMultipleSubDevicesInContext = areMultipleSubDevicesInContext;
+    args.implicitScaling = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true);
     appendSurfaceStateArgs(args);
     EncodeSurfaceState<GfxFamily>::encodeBuffer(args);
 }
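In the OpenCL buffer path the flag is derived from the device bitfield via ImplicitScalingHelper::isImplicitScalingEnabled. The standalone model below only checks for more than one sub-device bit; the real helper also consults platform support and debug flags, so treat this as an assumption-laden sketch, not the helper's implementation.

// Loose standalone model of the OpenCL-path decision.
#include <bitset>
#include <cstddef>

constexpr std::size_t maxSubDevices = 32;
using DeviceBitfieldSketch = std::bitset<maxSubDevices>;

bool implicitScalingActiveSketch(const DeviceBitfieldSketch &deviceBitfield) {
    return deviceBitfield.count() > 1;             // more than one tile selected for this device
}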

shared/source/command_container/command_encoder_xehp_and_later.inl

Lines changed: 1 addition & 3 deletions
@@ -639,9 +639,7 @@ void EncodeSurfaceState<Family>::encodeExtraBufferParams(EncodeSurfaceStateArgs
     }
 
     encodeExtraCacheSettings(surfaceState, *args.gmmHelper->getHardwareInfo());
-    DeviceBitfield deviceBitfield{static_cast<uint32_t>(maxNBitValue(args.numAvailableDevices))};
-    bool implicitScaling = ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, true);
-    bool enablePartialWrites = implicitScaling;
+    bool enablePartialWrites = args.implicitScaling;
     bool enableMultiGpuAtomics = enablePartialWrites;
 
     if (DebugManager.flags.EnableMultiGpuAtomicsOptimization.get()) {
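This is the core of the refactor: the encoder used to rebuild a device bitfield from args.numAvailableDevices (via maxNBitValue) and ask the implicit-scaling helper, independent of how the caller actually partitioned the work; now the caller's decision arrives directly in args.implicitScaling. A standalone sketch contrasting the two shapes, with hypothetical names and a simplified stand-in for the helper's check:

// Sketch only: simplified stand-ins, not the NEO encoder.
#include <cstdint>

struct EncodeArgsSketch {
    uint32_t numAvailableDevices = 1;
    bool implicitScaling = false;
};

// Old shape: derive a full bitfield (all N low bits set) and treat "more than
// one bit set" as implicit scaling, regardless of the caller's partitioning.
bool implicitScalingFromDeviceCountSketch(uint32_t numAvailableDevices) {
    uint64_t deviceBitfield = (numAvailableDevices >= 64) ? ~0ull
                                                          : ((1ull << numAvailableDevices) - 1);  // maxNBitValue
    return deviceBitfield != 0 && (deviceBitfield & (deviceBitfield - 1)) != 0;  // more than one bit set
}

// New shape: trust the flag the caller computed with its own context.
void encodeMultiGpuBitsSketch(const EncodeArgsSketch &args, bool &enablePartialWrites, bool &enableMultiGpuAtomics) {
    enablePartialWrites = args.implicitScaling;
    enableMultiGpuAtomics = enablePartialWrites;   // further refined by debug flags / atomics settings upstream
}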

shared/source/command_container/definitions/encode_surface_state_args_base.h

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021 Intel Corporation
+ * Copyright (C) 2021-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,7 @@ struct EncodeSurfaceStateArgsBase {
     bool isReadOnly = false;
     bool useGlobalAtomics = false;
     bool areMultipleSubDevicesInContext = false;
+    bool implicitScaling = false;
 
   protected:
     EncodeSurfaceStateArgsBase() = default;
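The new field defaults to false, so an encoder caller that never sets it encodes the surface state as if implicit scaling were off; every caller touched by this commit (the Level Zero command list and kernel paths, and the OpenCL buffer path) sets it explicitly from its own context.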

shared/test/unit_test/encoders/command_encoder_tests_xehp_and_later.cpp

Lines changed: 32 additions & 1 deletion
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021 Intel Corporation
+ * Copyright (C) 2021-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -95,3 +95,34 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterHardwareCommandsTest, givenPartitionArg
     ASSERT_NE(nullptr, storeDataImm);
     EXPECT_TRUE(storeDataImm->getWorkloadPartitionIdOffsetEnable());
 }
+
+HWTEST2_F(XeHPAndLaterCommandEncoderTest,
+          GivenImplicitAndAtomicsFlagsTrueWhenProgrammingSurfaceStateThenExpectMultiTileCorrectlySet, IsXEHP) {
+    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
+
+    auto memoryManager = pDevice->getExecutionEnvironment()->memoryManager.get();
+    size_t allocationSize = MemoryConstants::pageSize;
+    AllocationProperties properties(pDevice->getRootDeviceIndex(), allocationSize, GraphicsAllocation::AllocationType::BUFFER, pDevice->getDeviceBitfield());
+    auto allocation = memoryManager->allocateGraphicsMemoryWithProperties(properties);
+
+    auto outSurfaceState = FamilyType::cmdInitRenderSurfaceState;
+
+    NEO::EncodeSurfaceStateArgs args;
+    args.outMemory = &outSurfaceState;
+    args.graphicsAddress = allocation->getGpuAddress();
+    args.size = allocation->getUnderlyingBufferSize();
+    args.mocs = pDevice->getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER);
+    args.numAvailableDevices = pDevice->getNumGenericSubDevices();
+    args.allocation = allocation;
+    args.gmmHelper = pDevice->getGmmHelper();
+    args.areMultipleSubDevicesInContext = true;
+    args.implicitScaling = true;
+    args.useGlobalAtomics = true;
+
+    EncodeSurfaceState<FamilyType>::encodeBuffer(args);
+
+    EXPECT_FALSE(outSurfaceState.getDisableSupportForMultiGpuAtomics());
+    EXPECT_FALSE(outSurfaceState.getDisableSupportForMultiGpuPartialWrites());
+
+    memoryManager->freeGraphicsMemory(allocation);
+}
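Note the inverted polarity of the RENDER_SURFACE_STATE accessors: they are "disable" bits, so the EXPECT_FALSE checks here (and in the Level Zero test above) assert that multi-GPU atomics and multi-GPU partial writes remain enabled when implicitScaling and useGlobalAtomics are both set.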
