
Commit c36c083 (parent: 79c8605)

Refactor implicit scaling parameters for surface state

Related-To: NEO-6589
Signed-off-by: Zbigniew Zdanowicz <[email protected]>

9 files changed: +105 -18 lines

level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl

Lines changed: 1 addition & 0 deletions
@@ -266,6 +266,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
     args.gmmHelper = neoDevice->getGmmHelper();
     args.useGlobalAtomics = kernelImp->getKernelDescriptor().kernelAttributes.flags.useGlobalAtomics;
     args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1;
+    args.implicitScaling = this->partitionCount > 1;
 
     NEO::EncodeSurfaceState<GfxFamily>::encodeBuffer(args);
     *reinterpret_cast<typename GfxFamily::RENDER_SURFACE_STATE *>(surfaceStateSpace) = surfaceState;
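The command list already knows how many tile partitions the walker will be split into, so it sets the new flag directly from its own partition count instead of letting the encoder infer it from the sub-device count. Below is a minimal standalone sketch of that caller-side derivation; the types are hypothetical stand-ins, not the real NEO::EncodeSurfaceStateArgs.

// Hypothetical stand-in types: a minimal model of how a caller that already
// tracks its walker partition count fills the new flag.
#include <cstdint>

struct SurfaceStateArgsSketch {              // loose stand-in for NEO::EncodeSurfaceStateArgs
    uint32_t numAvailableDevices = 1;
    bool areMultipleSubDevicesInContext = false;
    bool implicitScaling = false;            // field introduced by this commit
};

SurfaceStateArgsSketch makeArgsForLaunch(uint32_t numSubDevices, uint32_t partitionCount) {
    SurfaceStateArgsSketch args{};
    args.numAvailableDevices = numSubDevices;
    args.areMultipleSubDevicesInContext = numSubDevices > 1;
    args.implicitScaling = partitionCount > 1;   // a launch split across tiles implies implicit scaling
    return args;
}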

level_zero/core/source/kernel/kernel_hw.h

Lines changed: 7 additions & 6 deletions
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2021 Intel Corporation
+ * Copyright (C) 2020-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,12 +63,14 @@ struct KernelHw : public KernelImp {
         bufferSizeForSsh = alignUp(bufferSizeForSsh, alignment);
 
         bool l3Enabled = true;
-
         // Allocation MUST be cacheline (64 byte) aligned in order to enable L3 caching otherwise Heap corruption will occur coming from the KMD.
         // Most commonly this issue will occur with Host Point Allocations from customers.
         l3Enabled = isL3Capable(*alloc);
 
-        auto allocData = this->module->getDevice()->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast<void *>(alloc->getGpuAddress()));
+        Device *device = module->getDevice();
+        NEO::Device *neoDevice = device->getNEODevice();
+
+        auto allocData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast<void *>(alloc->getGpuAddress()));
         if (allocData && allocData->allocationFlagsProperty.flags.locallyUncachedResource) {
             l3Enabled = false;
         }
@@ -77,18 +79,17 @@
             this->kernelRequiresQueueUncachedMocsCount++;
         }
 
-        NEO::Device *neoDevice = module->getDevice()->getNEODevice();
-
         NEO::EncodeSurfaceStateArgs args;
         args.outMemory = &surfaceState;
         args.graphicsAddress = bufferAddressForSsh;
         args.size = bufferSizeForSsh;
-        args.mocs = this->module->getDevice()->getMOCS(l3Enabled, false);
+        args.mocs = device->getMOCS(l3Enabled, false);
         args.numAvailableDevices = neoDevice->getNumGenericSubDevices();
         args.allocation = alloc;
         args.gmmHelper = neoDevice->getGmmHelper();
         args.useGlobalAtomics = kernelImmData->getDescriptor().kernelAttributes.flags.useGlobalAtomics;
         args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1;
+        args.implicitScaling = device->isImplicitScalingCapable();
 
         NEO::EncodeSurfaceState<GfxFamily>::encodeBuffer(args);
         *reinterpret_cast<typename GfxFamily::RENDER_SURFACE_STATE *>(surfaceStateAddress) = surfaceState;
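On the kernel path the flag comes from a device-level capability query (device->isImplicitScalingCapable()) and is set alongside the existing L3/MOCS decision. A simplified standalone model of that ordering follows; the names and MOCS values are illustrative assumptions, not the Level Zero KernelHw code.

// Standalone model of the ordering above: decide L3 eligibility first, then
// pick MOCS from the device, then take the implicit-scaling flag from a
// device-level capability rather than from a particular dispatch.
#include <cstdint>

struct DeviceSketch {
    bool implicitScalingCapable = false;
    uint32_t getMOCS(bool l3Enabled) const { return l3Enabled ? 2u : 0u; }   // illustrative indices only
};

struct BufferSurfaceChoice {
    uint32_t mocs = 0;
    bool implicitScaling = false;
};

BufferSurfaceChoice chooseBufferSurfaceState(const DeviceSketch &device,
                                             bool allocationIsL3Capable,
                                             bool locallyUncachedResource) {
    bool l3Enabled = allocationIsL3Capable;
    if (locallyUncachedResource) {
        l3Enabled = false;                       // locally uncached SVM allocations must bypass L3
    }
    return {device.getMOCS(l3Enabled), device.implicitScalingCapable};
}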

level_zero/core/source/kernel/kernel_imp.cpp

Lines changed: 7 additions & 5 deletions
@@ -84,7 +84,8 @@ KernelImmutableData::~KernelImmutableData() {
 
 inline void patchWithImplicitSurface(ArrayRef<uint8_t> crossThreadData, ArrayRef<uint8_t> surfaceStateHeap,
                                      uintptr_t ptrToPatchInCrossThreadData, NEO::GraphicsAllocation &allocation,
-                                     const NEO::ArgDescPointer &ptr, const NEO::Device &device, bool useGlobalAtomics) {
+                                     const NEO::ArgDescPointer &ptr, const NEO::Device &device, bool useGlobalAtomics,
+                                     bool implicitScaling) {
     if (false == crossThreadData.empty()) {
         NEO::patchPointer(crossThreadData, ptr, ptrToPatchInCrossThreadData);
     }
@@ -107,6 +108,7 @@ inline void patchWithImplicitSurface(ArrayRef<uint8_t> crossThreadData, ArrayRef
     args.numAvailableDevices = device.getNumGenericSubDevices();
     args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1;
     args.mocs = hwHelper.getMocsIndex(*args.gmmHelper, true, false) << 1;
+    args.implicitScaling = implicitScaling;
 
     hwHelper.encodeBufferSurfaceState(args);
 }
@@ -179,7 +181,7 @@ void KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, Device *device
         patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
                                  static_cast<uintptr_t>(globalConstBuffer->getGpuAddressToPatch()),
                                  *globalConstBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalConstantsSurfaceAddress,
-                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics);
+                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics, deviceImp->isImplicitScalingCapable());
         this->residencyContainer.push_back(globalConstBuffer);
     } else if (nullptr != globalConstBuffer) {
         this->residencyContainer.push_back(globalConstBuffer);
@@ -191,7 +193,7 @@ void KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, Device *device
         patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
                                  static_cast<uintptr_t>(globalVarBuffer->getGpuAddressToPatch()),
                                  *globalVarBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalVariablesSurfaceAddress,
-                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics);
+                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics, deviceImp->isImplicitScalingCapable());
         this->residencyContainer.push_back(globalVarBuffer);
     } else if (nullptr != globalVarBuffer) {
         this->residencyContainer.push_back(globalVarBuffer);
@@ -758,7 +760,7 @@ void KernelImp::patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocatio
     patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
                              static_cast<uintptr_t>(privateAllocation->getGpuAddressToPatch()),
                              *privateAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress,
-                             *device->getNEODevice(), kernelAttributes.flags.useGlobalAtomics);
+                             *device->getNEODevice(), kernelAttributes.flags.useGlobalAtomics, device->isImplicitScalingCapable());
 }
 
 ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
@@ -941,7 +943,7 @@ void KernelImp::setDebugSurface() {
         patchWithImplicitSurface(ArrayRef<uint8_t>(), surfaceStateHeapRef,
                                  0,
                                  *device->getDebugSurface(), this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.systemThreadSurfaceAddress,
-                                 *device->getNEODevice(), getKernelDescriptor().kernelAttributes.flags.useGlobalAtomics);
+                                 *device->getNEODevice(), getKernelDescriptor().kernelAttributes.flags.useGlobalAtomics, device->isImplicitScalingCapable());
     }
 }
 void *KernelImp::patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless) {
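With the extra implicitScaling parameter, all four call sites of patchWithImplicitSurface (global constants buffer, global variables buffer, private memory, and the debug surface) pass the device's isImplicitScalingCapable() result, so the helper itself no longer needs to reason about device partitioning.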

level_zero/core/test/unit_tests/fixtures/module_fixture.h

Lines changed: 23 additions & 1 deletion
@@ -1,12 +1,13 @@
 /*
- * Copyright (C) 2020-2021 Intel Corporation
+ * Copyright (C) 2020-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
  */
 
 #pragma once
 
+#include "shared/source/command_container/implicit_scaling.h"
 #include "shared/source/helpers/file_io.h"
 #include "shared/source/memory_manager/allocation_properties.h"
 #include "shared/source/program/kernel_info.h"
@@ -392,5 +393,26 @@ struct ImportHostPointerModuleFixture : public ModuleFixture {
     void *hostPointer = nullptr;
 };
 
+struct MultiTileModuleFixture : public MultiDeviceModuleFixture {
+    void SetUp() {
+        DebugManager.flags.EnableImplicitScaling.set(1);
+        MultiDeviceFixture::numRootDevices = 1u;
+        MultiDeviceFixture::numSubDevices = 2u;
+
+        MultiDeviceModuleFixture::SetUp();
+        createModuleFromBinary(0);
+
+        device = driverHandle->devices[0];
+    }
+
+    void TearDown() {
+        MultiDeviceModuleFixture::TearDown();
+    }
+
+    DebugManagerStateRestore debugRestore;
+    VariableBackup<bool> backup{&NEO::ImplicitScaling::apiSupport, true};
+    L0::Device *device = nullptr;
+};
+
 } // namespace ult
 } // namespace L0
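The fixture forces NEO::ImplicitScaling::apiSupport to true for the duration of each test and relies on VariableBackup to restore it afterwards. The sketch below only models that scoped-restore idiom with a hypothetical ScopedValueBackup; NEO's real VariableBackup is richer.

// Minimal sketch of the scoped-restore idiom the fixture relies on.
template <typename T>
class ScopedValueBackup {
  public:
    ScopedValueBackup(T *target, T newValue) : target(target), previous(*target) {
        *target = newValue;
    }
    ~ScopedValueBackup() { *target = previous; }   // restore on scope exit, e.g. at test teardown

  private:
    T *target;
    T previous;
};

namespace sketch {
bool apiSupport = false;                           // stand-in for NEO::ImplicitScaling::apiSupport
}

void runMultiTileScenario() {
    ScopedValueBackup<bool> backup(&sketch::apiSupport, true);  // force implicit scaling support on
    // ... exercise the multi-tile path while apiSupport == true ...
}                                                  // original value restored here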

level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp

Lines changed: 29 additions & 0 deletions
@@ -2328,5 +2328,34 @@ TEST_F(KernelImplicitArgTests, givenKernelWithoutImplicitArgsWhenPatchingImplici
     EXPECT_EQ(0, memcmp(data, initData, 64));
 }
 
+using MultiTileModuleTest = Test<MultiTileModuleFixture>;
+
+HWTEST2_F(MultiTileModuleTest, GivenMultiTileDeviceWhenSettingKernelArgAndSurfaceStateThenMultiTileFlagsAreSetCorrectly, IsXEHP) {
+    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
+    ze_kernel_desc_t desc = {};
+    desc.pKernelName = kernelName.c_str();
+
+    WhiteBoxKernelHw<gfxCoreFamily> mockKernel;
+    mockKernel.module = modules[0].get();
+    mockKernel.initialize(&desc);
+
+    auto &arg = const_cast<NEO::ArgDescPointer &>(mockKernel.kernelImmData->getDescriptor().payloadMappings.explicitArgs[0].template as<NEO::ArgDescPointer>());
+    arg.bindless = undefined<CrossThreadDataOffset>;
+    arg.bindful = 0x40;
+
+    constexpr size_t size = 128;
+    uint64_t gpuAddress = 0x2000;
+    char bufferArray[size] = {};
+    void *buffer = reinterpret_cast<void *>(bufferArray);
+    NEO::MockGraphicsAllocation mockAllocation(buffer, gpuAddress, size);
+
+    mockKernel.setBufferSurfaceState(0, buffer, &mockAllocation);
+
+    void *surfaceStateAddress = ptrOffset(mockKernel.surfaceStateHeapData.get(), arg.bindful);
+    RENDER_SURFACE_STATE *surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(surfaceStateAddress);
+    EXPECT_FALSE(surfaceState->getDisableSupportForMultiGpuAtomics());
+    EXPECT_FALSE(surfaceState->getDisableSupportForMultiGpuPartialWrites());
+}
+
 } // namespace ult
 } // namespace L0

opencl/source/mem_obj/buffer_base.inl

Lines changed: 3 additions & 1 deletion
@@ -1,11 +1,12 @@
 /*
- * Copyright (C) 2019-2021 Intel Corporation
+ * Copyright (C) 2019-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
  */
 
 #include "shared/source/command_container/command_encoder.h"
+#include "shared/source/command_container/implicit_scaling.h"
 #include "shared/source/device/device.h"
 #include "shared/source/execution_environment/execution_environment.h"
 #include "shared/source/execution_environment/root_device_environment.h"
@@ -52,6 +53,7 @@ void BufferHw<GfxFamily>::setArgStateful(void *memory, bool forceNonAuxMode, boo
     args.gmmHelper = device.getGmmHelper();
     args.useGlobalAtomics = useGlobalAtomics;
    args.areMultipleSubDevicesInContext = areMultipleSubDevicesInContext;
+    args.implicitScaling = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true);
     appendSurfaceStateArgs(args);
     EncodeSurfaceState<GfxFamily>::encodeBuffer(args);
 }
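In the OpenCL buffer path the flag is derived from the device bitfield via ImplicitScalingHelper::isImplicitScalingEnabled. The standalone model below only checks for more than one sub-device bit; the real helper also consults platform support and debug flags, so treat this as an assumption-laden sketch, not the helper's implementation.

// Loose standalone model of the OpenCL-path decision.
#include <bitset>
#include <cstddef>

constexpr std::size_t maxSubDevices = 32;
using DeviceBitfieldSketch = std::bitset<maxSubDevices>;

bool implicitScalingActiveSketch(const DeviceBitfieldSketch &deviceBitfield) {
    return deviceBitfield.count() > 1;             // more than one tile selected for this device
}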

shared/source/command_container/command_encoder_xehp_and_later.inl

Lines changed: 1 addition & 3 deletions
@@ -639,9 +639,7 @@ void EncodeSurfaceState<Family>::encodeExtraBufferParams(EncodeSurfaceStateArgs
     }
 
     encodeExtraCacheSettings(surfaceState, *args.gmmHelper->getHardwareInfo());
-    DeviceBitfield deviceBitfield{static_cast<uint32_t>(maxNBitValue(args.numAvailableDevices))};
-    bool implicitScaling = ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, true);
-    bool enablePartialWrites = implicitScaling;
+    bool enablePartialWrites = args.implicitScaling;
     bool enableMultiGpuAtomics = enablePartialWrites;
 
     if (DebugManager.flags.EnableMultiGpuAtomicsOptimization.get()) {
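This is the core of the refactor: the encoder used to rebuild a device bitfield from args.numAvailableDevices (via maxNBitValue) and ask the implicit-scaling helper, independent of how the caller actually partitioned the work; now the caller's decision arrives directly in args.implicitScaling. A standalone sketch contrasting the two shapes, with hypothetical names and a simplified stand-in for the helper's check:

// Sketch only: simplified stand-ins, not the NEO encoder.
#include <cstdint>

struct EncodeArgsSketch {
    uint32_t numAvailableDevices = 1;
    bool implicitScaling = false;
};

// Old shape: derive a full bitfield (all N low bits set) and treat "more than
// one bit set" as implicit scaling, regardless of the caller's partitioning.
bool implicitScalingFromDeviceCountSketch(uint32_t numAvailableDevices) {
    uint64_t deviceBitfield = (numAvailableDevices >= 64) ? ~0ull
                                                          : ((1ull << numAvailableDevices) - 1);  // maxNBitValue
    return deviceBitfield != 0 && (deviceBitfield & (deviceBitfield - 1)) != 0;  // more than one bit set
}

// New shape: trust the flag the caller computed with its own context.
void encodeMultiGpuBitsSketch(const EncodeArgsSketch &args, bool &enablePartialWrites, bool &enableMultiGpuAtomics) {
    enablePartialWrites = args.implicitScaling;
    enableMultiGpuAtomics = enablePartialWrites;   // further refined by debug flags / atomics settings upstream
}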

shared/source/command_container/definitions/encode_surface_state_args_base.h

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021 Intel Corporation
+ * Copyright (C) 2021-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,7 @@ struct EncodeSurfaceStateArgsBase {
     bool isReadOnly = false;
     bool useGlobalAtomics = false;
     bool areMultipleSubDevicesInContext = false;
+    bool implicitScaling = false;
 
   protected:
     EncodeSurfaceStateArgsBase() = default;
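The new field defaults to false, so an encoder caller that never sets it encodes the surface state as if implicit scaling were off; every caller touched by this commit (the Level Zero command list and kernel paths, and the OpenCL buffer path) sets it explicitly from its own context.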

shared/test/unit_test/encoders/command_encoder_tests_xehp_and_later.cpp

Lines changed: 32 additions & 1 deletion
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021 Intel Corporation
+ * Copyright (C) 2021-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -95,3 +95,34 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterHardwareCommandsTest, givenPartitionArg
     ASSERT_NE(nullptr, storeDataImm);
     EXPECT_TRUE(storeDataImm->getWorkloadPartitionIdOffsetEnable());
 }
+
+HWTEST2_F(XeHPAndLaterCommandEncoderTest,
+          GivenImplicitAndAtomicsFlagsTrueWhenProgrammingSurfaceStateThenExpectMultiTileCorrectlySet, IsXEHP) {
+    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
+
+    auto memoryManager = pDevice->getExecutionEnvironment()->memoryManager.get();
+    size_t allocationSize = MemoryConstants::pageSize;
+    AllocationProperties properties(pDevice->getRootDeviceIndex(), allocationSize, GraphicsAllocation::AllocationType::BUFFER, pDevice->getDeviceBitfield());
+    auto allocation = memoryManager->allocateGraphicsMemoryWithProperties(properties);
+
+    auto outSurfaceState = FamilyType::cmdInitRenderSurfaceState;
+
+    NEO::EncodeSurfaceStateArgs args;
+    args.outMemory = &outSurfaceState;
+    args.graphicsAddress = allocation->getGpuAddress();
+    args.size = allocation->getUnderlyingBufferSize();
+    args.mocs = pDevice->getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER);
+    args.numAvailableDevices = pDevice->getNumGenericSubDevices();
+    args.allocation = allocation;
+    args.gmmHelper = pDevice->getGmmHelper();
+    args.areMultipleSubDevicesInContext = true;
+    args.implicitScaling = true;
+    args.useGlobalAtomics = true;
+
+    EncodeSurfaceState<FamilyType>::encodeBuffer(args);
+
+    EXPECT_FALSE(outSurfaceState.getDisableSupportForMultiGpuAtomics());
+    EXPECT_FALSE(outSurfaceState.getDisableSupportForMultiGpuPartialWrites());
+
+    memoryManager->freeGraphicsMemory(allocation);
+}
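Note the inverted polarity of the RENDER_SURFACE_STATE accessors: they are "disable" bits, so the EXPECT_FALSE checks here (and in the Level Zero test above) assert that multi-GPU atomics and multi-GPU partial writes remain enabled when implicitScaling and useGlobalAtomics are both set.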
