Skip to content

Commit 9dabc2d

Browse files
Fail build program in shared system USM + statefull access case OCL
Signed-off-by: Kamil Kopryk <[email protected]> Related-To: NEO-6075
1 parent b98cfdd commit 9dabc2d

File tree

12 files changed

+155
-8
lines changed

12 files changed

+155
-8
lines changed

opencl/source/program/build.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,7 @@ cl_int Program::build(
3434
const char *buildOptions,
3535
bool enableCaching) {
3636
cl_int retVal = CL_SUCCESS;
37-
std::string internalOptions;
38-
initInternalOptions(internalOptions);
37+
3938
auto defaultClDevice = deviceVector[0];
4039
UNRECOVERABLE_IF(defaultClDevice == nullptr);
4140
auto &defaultDevice = defaultClDevice->getDevice();
@@ -69,6 +68,9 @@ cl_int Program::build(
6968
} else if (this->createdFrom != CreatedFrom::BINARY) {
7069
options = "";
7170
}
71+
std::string internalOptions;
72+
initInternalOptions(internalOptions);
73+
7274
extractInternalOptions(options, internalOptions);
7375
applyAdditionalOptions(internalOptions);
7476

@@ -166,6 +168,10 @@ cl_int Program::build(
166168
phaseReached[clDevice->getRootDeviceIndex()] = BuildPhase::BinaryProcessing;
167169
}
168170

171+
if (containsStatefulAccess(defaultDevice.getRootDeviceIndex()) && forceToStatelessNeeded() && !isBuiltIn) {
172+
retVal = CL_BUILD_PROGRAM_FAILURE;
173+
}
174+
169175
if (retVal != CL_SUCCESS) {
170176
break;
171177
}

opencl/source/program/program.cpp

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ Program::Program(Context *context, bool isBuiltIn, const ClDeviceVector &clDevic
6666
kernelDebugEnabled = clDevices[0]->isDebuggerActive();
6767
}
6868
void Program::initInternalOptions(std::string &internalOptions) const {
69+
6970
auto pClDevice = clDevices[0];
7071
auto force32BitAddressess = pClDevice->getSharedDeviceInfo().force32BitAddressess;
7172
internalOptions = getOclVersionCompilerInternalOption(pClDevice->getEnabledClVersion());
@@ -74,7 +75,7 @@ void Program::initInternalOptions(std::string &internalOptions) const {
7475
CompilerOptions::concatenateAppend(internalOptions, CompilerOptions::arch32bit);
7576
}
7677

77-
if ((isBuiltIn && is32bit) || pClDevice->areSharedSystemAllocationsAllowed() ||
78+
if ((isBuiltIn && is32bit) || forceToStatelessNeeded() ||
7879
DebugManager.flags.DisableStatelessToStatefulOptimization.get()) {
7980
CompilerOptions::concatenateAppend(internalOptions, CompilerOptions::greaterThan4gbBuffersRequired);
8081
}
@@ -136,6 +137,20 @@ Program::~Program() {
136137
}
137138
}
138139

140+
bool Program::forceToStatelessNeeded() const {
141+
auto preferStateful = false;
142+
if (auto it = options.find(NEO::CompilerOptions::smallerThan4gbBuffersOnly.data()); it != std::string::npos) {
143+
preferStateful = true;
144+
}
145+
146+
if (DebugManager.flags.UseSmallerThan4gbBuffersOnly.get() != -1) {
147+
preferStateful = static_cast<bool>(DebugManager.flags.UseSmallerThan4gbBuffersOnly.get());
148+
}
149+
150+
auto forceStateless = !preferStateful && clDevices[0]->areSharedSystemAllocationsAllowed();
151+
return forceStateless;
152+
}
153+
139154
cl_int Program::createProgramFromBinary(
140155
const void *pBinary,
141156
size_t binarySize, ClDevice &clDevice) {
@@ -489,6 +504,22 @@ cl_int Program::packDeviceBinary(ClDevice &clDevice) {
489504
return CL_SUCCESS;
490505
}
491506

507+
bool Program::containsStatefulAccess(uint32_t rootDeviceIndex) const {
508+
auto &buildInfo = buildInfos[rootDeviceIndex];
509+
for (const auto &kernelInfo : buildInfo.kernelInfoArray) {
510+
for (const auto &arg : kernelInfo->kernelDescriptor.payloadMappings.explicitArgs) {
511+
auto isStatefulAccess = arg.is<ArgDescriptor::ArgTPointer>() &&
512+
(isValidOffset(arg.as<ArgDescPointer>().bindless) ||
513+
isValidOffset(arg.as<ArgDescPointer>().bindful));
514+
if (isStatefulAccess) {
515+
return true;
516+
}
517+
}
518+
}
519+
520+
return false;
521+
}
522+
492523
void Program::setBuildStatus(cl_build_status status) {
493524
for (auto &deviceBuildInfo : deviceBuildInfos) {
494525
deviceBuildInfo.second.buildStatus = status;

opencl/source/program/program.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,10 @@ class Program : public BaseObject<_cl_program> {
283283
}
284284

285285
protected:
286+
bool forceToStatelessNeeded() const;
287+
288+
MOCKABLE_VIRTUAL bool containsStatefulAccess(uint32_t rootDeviceIndex) const;
289+
286290
MOCKABLE_VIRTUAL cl_int createProgramFromBinary(const void *pBinary, size_t binarySize, ClDevice &clDevice);
287291

288292
cl_int packDeviceBinary(ClDevice &clDevice);
@@ -366,6 +370,7 @@ class Program : public BaseObject<_cl_program> {
366370

367371
bool isBuiltIn = false;
368372
bool kernelDebugEnabled = false;
373+
bool containsStatefulAccesses = false;
369374
uint32_t maxRootDeviceIndex = std::numeric_limits<uint32_t>::max();
370375
std::mutex lockMutex;
371376
uint32_t exposedKernels = 0;

opencl/test/unit_test/mocks/mock_program.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ class MockProgram : public Program {
4040
using Program::debugDataSize;
4141
using Program::deviceBuildInfos;
4242
using Program::extractInternalOptions;
43+
using Program::forceToStatelessNeeded;
4344
using Program::getKernelInfo;
4445
using Program::internalOptionsToExtract;
4546
using Program::irBinary;

opencl/test/unit_test/program/program_tests.cpp

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1653,6 +1653,105 @@ TEST_F(ProgramTests, WhenProgramIsCreatedThenCorrectOclVersionIsInOptions) {
16531653
}
16541654
}
16551655

1656+
TEST_F(ProgramTests, whenForceToStatelessNeededIsCalledThenCorrectResultIsReturned) {
1657+
DebugManagerStateRestore restorer;
1658+
1659+
class MyMockProgram : public Program {
1660+
public:
1661+
using Program::forceToStatelessNeeded;
1662+
using Program::options;
1663+
using Program::Program;
1664+
};
1665+
1666+
MyMockProgram program(pContext, false, toClDeviceVector(*pClDevice));
1667+
1668+
{
1669+
DebugManager.flags.UseSmallerThan4gbBuffersOnly.set(-1);
1670+
program.options = "";
1671+
EXPECT_EQ(program.forceToStatelessNeeded(), pClDevice->areSharedSystemAllocationsAllowed());
1672+
}
1673+
{
1674+
DebugManager.flags.UseSmallerThan4gbBuffersOnly.set(-1);
1675+
program.options = "-cl-opt-smaller-than-4GB-buffers-only";
1676+
EXPECT_FALSE(program.forceToStatelessNeeded());
1677+
}
1678+
{
1679+
DebugManager.flags.UseSmallerThan4gbBuffersOnly.set(0);
1680+
program.options = "";
1681+
EXPECT_EQ(program.forceToStatelessNeeded(), pClDevice->areSharedSystemAllocationsAllowed());
1682+
}
1683+
{
1684+
DebugManager.flags.UseSmallerThan4gbBuffersOnly.set(0);
1685+
program.options = "-cl-opt-smaller-than-4GB-buffers-only";
1686+
EXPECT_EQ(program.forceToStatelessNeeded(), pClDevice->areSharedSystemAllocationsAllowed());
1687+
}
1688+
{
1689+
DebugManager.flags.UseSmallerThan4gbBuffersOnly.set(1);
1690+
program.options = "";
1691+
EXPECT_FALSE(program.forceToStatelessNeeded());
1692+
}
1693+
{
1694+
DebugManager.flags.UseSmallerThan4gbBuffersOnly.set(1);
1695+
program.options = "-cl-opt-smaller-than-4GB-buffers-only";
1696+
EXPECT_FALSE(program.forceToStatelessNeeded());
1697+
}
1698+
}
1699+
1700+
TEST_F(ProgramTests, givenStatefulAndStatelessAccessesWhenProgramBuildIsCalledThenCorrectResultIsReturned) {
1701+
DebugManagerStateRestore restorer;
1702+
1703+
class MyMockProgram : public Program {
1704+
public:
1705+
using Program::containsStatefulAccess;
1706+
using Program::createdFrom;
1707+
using Program::irBinary;
1708+
using Program::irBinarySize;
1709+
using Program::isBuiltIn;
1710+
using Program::options;
1711+
using Program::Program;
1712+
using Program::sourceCode;
1713+
1714+
bool containsStatefulAccess(uint32_t rootDeviceIndex) const override {
1715+
return hasStatefulAccess;
1716+
}
1717+
1718+
bool hasStatefulAccess = false;
1719+
};
1720+
1721+
MyMockProgram program(pContext, false, toClDeviceVector(*pClDevice));
1722+
program.isBuiltIn = false;
1723+
program.sourceCode = "test_kernel";
1724+
program.createdFrom = Program::CreatedFrom::SOURCE;
1725+
1726+
{
1727+
EXPECT_EQ(CL_SUCCESS, program.build(toClDeviceVector(*pClDevice), nullptr, false));
1728+
}
1729+
{
1730+
program.hasStatefulAccess = true;
1731+
DebugManager.flags.UseSmallerThan4gbBuffersOnly.set(1);
1732+
EXPECT_EQ(CL_SUCCESS, program.build(toClDeviceVector(*pClDevice), nullptr, false));
1733+
}
1734+
{
1735+
program.hasStatefulAccess = true;
1736+
DebugManager.flags.UseSmallerThan4gbBuffersOnly.set(0);
1737+
if (pClDevice->areSharedSystemAllocationsAllowed()) {
1738+
EXPECT_EQ(CL_BUILD_PROGRAM_FAILURE, program.build(toClDeviceVector(*pClDevice), nullptr, false));
1739+
} else {
1740+
EXPECT_EQ(CL_SUCCESS, program.build(toClDeviceVector(*pClDevice), nullptr, false));
1741+
}
1742+
}
1743+
{
1744+
MyMockProgram programWithBuiltIn(pContext, true, toClDeviceVector(*pClDevice));
1745+
programWithBuiltIn.irBinary.reset(new char[16]);
1746+
programWithBuiltIn.irBinarySize = 16;
1747+
1748+
programWithBuiltIn.isBuiltIn = true;
1749+
programWithBuiltIn.hasStatefulAccess = true;
1750+
DebugManager.flags.UseSmallerThan4gbBuffersOnly.set(0);
1751+
EXPECT_EQ(CL_SUCCESS, programWithBuiltIn.build(toClDeviceVector(*pClDevice), nullptr, false));
1752+
}
1753+
}
1754+
16561755
TEST_F(ProgramTests, GivenForcedClVersionWhenProgramIsCreatedThenCorrectOclOptionIsPresent) {
16571756
std::pair<unsigned int, std::string> testedValues[] = {
16581757
{0, "-ocl-version=120"},

opencl/test/unit_test/test_files/igdrcl.config

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ ForceCsrFlushing = 0
110110
ForceCsrReprogramming = 0
111111
OmitTimestampPacketDependencies = 0
112112
DisableStatelessToStatefulOptimization = 0
113+
UseSmallerThan4gbBuffersOnly = -1
113114
DisableConcurrentBlockExecution = 0
114115
UseNoRingFlushesKmdMode = 1
115116
DisableZeroCopyForUseHostPtr = 0

opencl/test/unit_test/test_files/kernel_num_args.cl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,5 @@
66
*/
77

88
__kernel void test(__global float *argGlobal, __read_only image3d_t argImg3D, __constant float *argConst) {
9+
argGlobal[0] = argConst[0];
910
}

opencl/test/unit_test/test_files/test_constant_memory.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,6 @@ __constant uint constant_a[2] = {0xabcd5432u, 0xaabb5533u};
1010
__kernel void test(__global uint *in, __global uint *out) {
1111
int i = get_global_id(0);
1212
int j = get_global_id(0) % (sizeof(constant_a) / sizeof(constant_a[0]));
13-
13+
in[0] = 0;
1414
out[i] = constant_a[j];
1515
}

shared/source/compiler_interface/compiler_options/compiler_options_base.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
namespace NEO {
1414
namespace CompilerOptions {
1515
static constexpr ConstStringRef greaterThan4gbBuffersRequired = "-cl-intel-greater-than-4GB-buffer-required";
16+
static constexpr ConstStringRef smallerThan4gbBuffersOnly = "-cl-opt-smaller-than-4GB-buffers-only";
1617
static constexpr ConstStringRef hasBufferOffsetArg = "-cl-intel-has-buffer-offset-arg";
1718
static constexpr ConstStringRef kernelDebugEnable = "-cl-kernel-debug-enable";
1819
static constexpr ConstStringRef arch32bit = "-m32";

shared/source/debug_settings/debug_variables_base.inl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,7 @@ DECLARE_DEBUG_VARIABLE(bool, DisableStatelessToStatefulOptimization, false, "Dis
207207
DECLARE_DEBUG_VARIABLE(bool, DisableConcurrentBlockExecution, false, "disables concurrent block kernel execution")
208208
DECLARE_DEBUG_VARIABLE(bool, UseNoRingFlushesKmdMode, true, "Windows only, passes flag to KMD that informs KMD to not emit any ring buffer flushes.")
209209
DECLARE_DEBUG_VARIABLE(bool, DisableZeroCopyForUseHostPtr, false, "When active all buffer allocations created with CL_MEM_USE_HOST_PTR flag will not share memory with CPU.")
210+
DECLARE_DEBUG_VARIABLE(int32_t, UseSmallerThan4gbBuffersOnly, -1, " -1: default, 0: disabled, 1: enabled. When enabled driver will not force stateless accesses when shared system USM is active")
210211
DECLARE_DEBUG_VARIABLE(int32_t, EnableHostPtrTracking, -1, "Enable host ptr tracking: -1 - default platform setting, 0 - disabled, 1 - enabled")
211212
DECLARE_DEBUG_VARIABLE(int32_t, MaxHwThreadsPercent, 0, "If not zero then maximum number of used HW threads is capped to max * MaxHwThreadsPercent / 100")
212213
DECLARE_DEBUG_VARIABLE(int32_t, MinHwThreadsUnoccupied, 0, "If not zero then maximum number of used HW threads is reduced by MinHwThreadsUnoccupied")

shared/source/helpers/compiler_hw_info_config_base.inl

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,6 @@
1010
#include "shared/source/helpers/compiler_hw_info_config.h"
1111

1212
namespace NEO {
13-
template <PRODUCT_FAMILY gfxProduct>
14-
bool CompilerHwInfoConfigHw<gfxProduct>::isForceToStatelessRequired() const {
15-
return false;
16-
}
1713

1814
template <PRODUCT_FAMILY gfxProduct>
1915
bool CompilerHwInfoConfigHw<gfxProduct>::isForceEmuInt32DivRemSPRequired() const {

shared/source/helpers/compiler_hw_info_config_bdw_and_later.inl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,9 @@ bool CompilerHwInfoConfigHw<gfxProduct>::isMidThreadPreemptionSupported(const Ha
1616
return hwInfo.featureTable.ftrGpGpuMidThreadLevelPreempt;
1717
}
1818

19+
template <PRODUCT_FAMILY gfxProduct>
20+
bool CompilerHwInfoConfigHw<gfxProduct>::isForceToStatelessRequired() const {
21+
return false;
22+
}
23+
1924
} // namespace NEO

0 commit comments

Comments
 (0)