Skip to content

Commit 3a9ea8e

Browse files
aratajew web-flow
authored and committed
Support SPV_INTEL_subgroup_buffer_prefetch and cl_intel_subgroups_buffer_prefetch extensions
This change also implements support for `SPV_INTEL_cache_controls` extension for prefetches from `SPV_INTEL_subgroup_buffer_prefetch` and `cl_intel_subgroups_buffer_prefetch` extensions. (cherry picked from commit af4d16f)
1 parent b5003a8 commit 3a9ea8e

File tree

16 files changed

+1599
-8
lines changed

16 files changed

+1599
-8
lines changed

IGC/AdaptorOCL/preprocess_spvir/HandleSPIRVDecorations/HandleSpirvDecorationMetadata.cpp

Lines changed: 130 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,42 @@ void HandleSpirvDecorationMetadata::visit1DBlockWriteCallInst(CallInst& I)
220220
}
221221
}
222222

223+
// Handles a SubgroupBlockPrefetchINTEL call: reads the SPIR-V decorations recorded for
// the call's first operand and, for the load cache-control decoration, hands the call to
// handleCacheControlINTELFor1DBlockIO<LoadCacheControl>. All other decoration ids are
// ignored.
void HandleSpirvDecorationMetadata::visit1DBlockPrefetchCallInst(CallInst& I)
{
    Value* address = I.getArgOperand(0);
    auto decorations = parseSPIRVDecorationsFromMD(address);

    for (auto& [decorationKind, decorationNodes] : decorations)
    {
        // IDecCacheControlLoadINTEL
        if (decorationKind == DecorationIdCacheControlLoad)
        {
            handleCacheControlINTELFor1DBlockIO<LoadCacheControl>(I, decorationNodes);
        }
    }
}
240+
241+
// Handles an OpenCL intel_sub_group_block_prefetch_* call: reads the SPIR-V decorations
// recorded for the call's first operand and, for the load cache-control decoration,
// hands the call (together with the regex captures of the callee name) to
// handleCacheControlINTELForOCL1DBlockPrefetch. All other decoration ids are ignored.
void HandleSpirvDecorationMetadata::visitOCL1DBlockPrefetchCallInst(CallInst& I, SmallVectorImpl<StringRef>& Matches)
{
    Value* address = I.getArgOperand(0);
    auto decorations = parseSPIRVDecorationsFromMD(address);

    for (auto& [decorationKind, decorationNodes] : decorations)
    {
        // IDecCacheControlLoadINTEL
        if (decorationKind == DecorationIdCacheControlLoad)
        {
            handleCacheControlINTELForOCL1DBlockPrefetch(I, decorationNodes, Matches);
        }
    }
}
258+
223259
void HandleSpirvDecorationMetadata::visitCallInst(CallInst& I)
224260
{
225261
Function* F = I.getCalledFunction();
@@ -229,14 +265,17 @@ void HandleSpirvDecorationMetadata::visitCallInst(CallInst& I)
229265
"_Z[0-9]+(intel_sub_group_2d_block_(prefetch|read|read_transform|read_transpose)_[0-9]+b_[0-9]+r[0-9]+x[0-9]+c)");
230266
Regex pattern2DBlockWrite(
231267
"_Z[0-9]+(intel_sub_group_2d_block_write_[0-9]+b_[0-9]+r[0-9]+x[0-9]+c)");
268+
Regex patternOCL1DBlockPrefetch("_Z[0-9]+(intel_sub_group_block_prefetch_(uc|us|ui|ul)(2|4|8|16)?)");
232269
#if defined(IGC_SCALAR_USE_KHRONOS_SPIRV_TRANSLATOR)
233270
Regex patternPrefetch("_Z[0-9]+__spirv_ocl_prefetch");
234271
Regex pattern1DBlockRead("_Z[0-9]+__spirv_SubgroupBlockReadINTEL");
235272
Regex pattern1DBlockWrite("_Z[0-9]+__spirv_SubgroupBlockWriteINTEL");
273+
Regex pattern1DBlockPrefetch("_Z[0-9]+__spirv_SubgroupBlockPrefetchINTEL");
236274
#else // IGC Legacy SPIRV Translator
237275
Regex patternPrefetch("__builtin_spirv_OpenCL_prefetch");
238276
Regex pattern1DBlockRead("__builtin_spirv_OpSubgroupBlockReadINTEL");
239277
Regex pattern1DBlockWrite("__builtin_spirv_OpSubgroupBlockWriteINTEL");
278+
Regex pattern1DBlockPrefetch("__builtin_spirv_OpSubgroupBlockPrefetchINTEL");
240279
#endif
241280

242281
SmallVector<StringRef, 4> Matches;
@@ -262,6 +301,14 @@ void HandleSpirvDecorationMetadata::visitCallInst(CallInst& I)
262301
{
263302
visit1DBlockWriteCallInst(I);
264303
}
304+
else if (pattern1DBlockPrefetch.match(funcName, &Matches))
305+
{
306+
visit1DBlockPrefetchCallInst(I);
307+
}
308+
else if (patternOCL1DBlockPrefetch.match(funcName, &Matches))
309+
{
310+
visitOCL1DBlockPrefetchCallInst(I, Matches);
311+
}
265312
}
266313

267314
template<typename T>
@@ -395,8 +442,16 @@ void HandleSpirvDecorationMetadata::handleCacheControlINTELFor1DBlockIO(CallInst
395442
std::string funcName;
396443
if constexpr (std::is_same_v<T, LoadCacheControl>)
397444
{
398-
operationType = I.getType();
399-
funcName = "SubgroupBlockReadINTEL";
445+
if (auto isPrefetch = I.getType()->isVoidTy())
446+
{
447+
operationType = IGCLLVM::getNonOpaquePtrEltTy(I.getArgOperand(0)->getType());
448+
funcName = "SubgroupBlockPrefetchINTEL";
449+
}
450+
else
451+
{
452+
operationType = I.getType();
453+
funcName = "SubgroupBlockReadINTEL";
454+
}
400455
}
401456
else
402457
{
@@ -458,3 +513,76 @@ void HandleSpirvDecorationMetadata::handleCacheControlINTELFor1DBlockIO(CallInst
458513
if (F->getNumUses() == 0)
459514
m_BuiltinsToRemove.insert(F);
460515
}
516+
517+
// Replaces an OpenCL `intel_sub_group_block_prefetch_*` call with a call to the matching
// `__internal_SubgroupBlockPrefetchINTEL_<type>_cache_controls` builtin. The new call
// receives the original arguments followed by the number of bytes to prefetch and the
// resolved load cache-control value.
//
// I       - the prefetch call to rewrite; it is erased once the replacement is inserted.
// MDNodes - cache-control decoration nodes used to resolve the cache-control value.
// Matches - regex captures of the callee name: [1] is the builtin name and [3] is the
//           optional element-count suffix (empty when the builtin has no suffix).
//
// The call is left untouched when no cache control is requested, when the requested
// configuration is invalid (a warning is emitted), or when the pointee type / element
// count cannot be mapped onto one of the internal builtins. Every bail-out happens
// before the IR is modified, so a failure never leaves a half-rewritten call behind.
void HandleSpirvDecorationMetadata::handleCacheControlINTELForOCL1DBlockPrefetch(CallInst& I, SmallPtrSetImpl<MDNode*>& MDNodes, SmallVectorImpl<StringRef>& Matches)
{
    IGC_ASSERT(Matches.size() > 3);
    IGC_ASSERT(Matches[1].startswith("intel_sub_group_block_prefetch"));

    CacheControlFromMDNodes cacheControl = resolveCacheControlFromMDNodes<LoadCacheControl>(m_pCtx, MDNodes);
    if (cacheControl.isEmpty) return;
    if (cacheControl.isInvalid)
    {
        m_pCtx->EmitWarning("Unsupported cache controls configuration requested. Applying default configuration.");
        return;
    }

    Function* F = I.getCalledFunction();
    IGC_ASSERT(F);

    Type* pointeeTy = IGCLLVM::getNonOpaquePtrEltTy(I.getArgOperand(0)->getType());
    IGC_ASSERT(pointeeTy->isIntegerTy());
    // getIntegerBitWidth() below is only valid for integer types; keep the original call
    // in place rather than querying a non-integer pointee when asserts are compiled out.
    if (!pointeeTy->isIntegerTy())
        return;

    // A missing suffix means a single element. getAsInteger() returns true on failure
    // and, unlike std::stoi, never throws.
    uint32_t numElementsToPrefetch = 1;
    if (!Matches[3].empty() && Matches[3].getAsInteger(10, numElementsToPrefetch))
    {
        IGC_ASSERT(0 && "Malformed element count in block prefetch builtin name!");
        return;
    }
    IGC_ASSERT(numElementsToPrefetch == 1 ||
        numElementsToPrefetch == 2 ||
        numElementsToPrefetch == 4 ||
        numElementsToPrefetch == 8 ||
        numElementsToPrefetch == 16);

    uint32_t typeSizeInBytes = pointeeTy->getIntegerBitWidth() / 8;

    // Select the element-type component of the internal builtin name.
    std::string typeName;
    switch (typeSizeInBytes)
    {
    case 1:
        typeName = "char";
        break;
    case 2:
        typeName = "short";
        break;
    case 4:
        typeName = "int";
        break;
    case 8:
        typeName = "long";
        break;
    default:
        // Without this return a build with asserts disabled would continue with an empty
        // type name and declare/call a builtin that does not exist.
        IGC_ASSERT(0 && "Unsupported block prefetch!");
        return;
    }

    Value* numBytesArg =
        (ConstantInt::get(Type::getInt32Ty(I.getContext()), (typeSizeInBytes * numElementsToPrefetch)));

    // New argument list: original arguments, then byte count, then cache-control value.
    SmallVector<Value*, 3> args(I.args());
    args.push_back(numBytesArg);
    args.push_back(ConstantInt::get(Type::getInt32Ty(I.getContext()), cacheControl.value));

    SmallVector<Type*, 3> argTypes;
    for (const auto& arg : args)
        argTypes.push_back(arg->getType());

    auto* funcTy = FunctionType::get(I.getType(), argTypes, false);
    auto newFuncName = "__internal_SubgroupBlockPrefetchINTEL_" + typeName + "_cache_controls";
    auto newFunction = m_Module->getOrInsertFunction(newFuncName, funcTy);

    auto newCall = CallInst::Create(newFunction, args, "", &I);
    I.replaceAllUsesWith(newCall);
    I.eraseFromParent();
    m_changed = true;

    // Cleanup unused function if all calls have been replaced with the internal version
    if (F->getNumUses() == 0)
        m_BuiltinsToRemove.insert(F);
}

IGC/AdaptorOCL/preprocess_spvir/HandleSPIRVDecorations/HandleSpirvDecorationMetadata.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ namespace IGC
5252
void visitPrefetchCallInst(llvm::CallInst& I);
5353
void visit1DBlockReadCallInst(llvm::CallInst& I);
5454
void visit1DBlockWriteCallInst(llvm::CallInst& I);
55+
void visit1DBlockPrefetchCallInst(llvm::CallInst& I);
56+
void visitOCL1DBlockPrefetchCallInst(llvm::CallInst& I, llvm::SmallVectorImpl<llvm::StringRef>& Matches);
5557

5658
private:
5759
llvm::Module* m_Module = nullptr;
@@ -71,5 +73,6 @@ namespace IGC
7173
void handleCacheControlINTELForPrefetch(llvm::CallInst& I, llvm::SmallPtrSetImpl<llvm::MDNode*>& MDNodes);
7274
template<typename T>
7375
void handleCacheControlINTELFor1DBlockIO(llvm::CallInst& I, llvm::SmallPtrSetImpl<llvm::MDNode*>& MDNodes);
76+
void handleCacheControlINTELForOCL1DBlockPrefetch(llvm::CallInst& I, llvm::SmallPtrSetImpl<llvm::MDNode*>& MDNodes, llvm::SmallVectorImpl<llvm::StringRef>& Matches);
7477
};
7578
}

IGC/BiFModule/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -480,7 +480,7 @@ set(KHR_DEFINES "cl_khr_f16" "cl_khr_fp64" "cl_khr_gl_msaa_sharing" "cl_khr_mipm
480480
"cl_intel_subgroups_char" "cl_intel_subgroups_long" "cl_intel_subgroup_local_block_io" "cl_intel_64bit_global_atomics_placeholder"
481481
"cl_khr_subgroup_extended_types" "cl_khr_subgroup_non_uniform_vote" "cl_khr_subgroup_ballot" "cl_khr_subgroup_shuffle"
482482
"cl_khr_subgroup_shuffle_relative" "cl_khr_subgroup_non_uniform_arithmetic" "cl_khr_subgroup_clustered_reduce"
483-
"cl_khr_extended_bit_ops" "cl_intel_bit_instructions" "cl_intel_global_float_atomics")
483+
"cl_khr_extended_bit_ops" "cl_intel_bit_instructions" "cl_intel_global_float_atomics" "cl_intel_subgroups_buffer_prefetch")
484484
set(KHR_DEFINES ${KHR_DEFINES} "cl_intel_subgroup_matrix_multiply_accumulate" "cl_intel_subgroup_split_matrix_multiply_accumulate")
485485
set(KHR_DEFINES ${KHR_DEFINES} "cl_intel_rt_production")
486486
set(KHR_DEFINES ${KHR_DEFINES} "cl_intel_subgroup_matrix_multiply_accumulate_tf32")

IGC/BiFModule/Headers/spirv.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4830,6 +4830,13 @@ long4 SPIRV_OVERLOADABLE SPIRV_BUILTIN(SubgroupImageBlockReadINTEL, _v4i64_img2
48304830
long8 SPIRV_OVERLOADABLE SPIRV_BUILTIN(SubgroupImageBlockReadINTEL, _v8i64_img2d_ro_v2i32, _Rlong8)(global Img2d_ro* image, int2 coord);
48314831
#endif // cl_intel_subgroups_long
48324832

4833+
#ifdef cl_intel_subgroups_buffer_prefetch
4834+
// SubgroupBlockPrefetchINTEL overloads, one per element type (uchar, ushort, uint, ulong).
// Each takes a pointer to const global memory and a byte count, and returns nothing.
void SPIRV_OVERLOADABLE SPIRV_BUILTIN(SubgroupBlockPrefetchINTEL, _p1i8, )(const global uchar* ptr, uint num_bytes);
4835+
void SPIRV_OVERLOADABLE SPIRV_BUILTIN(SubgroupBlockPrefetchINTEL, _p1i16, )(const global ushort* ptr, uint num_bytes);
4836+
void SPIRV_OVERLOADABLE SPIRV_BUILTIN(SubgroupBlockPrefetchINTEL, _p1i32, )(const global uint* ptr, uint num_bytes);
4837+
void SPIRV_OVERLOADABLE SPIRV_BUILTIN(SubgroupBlockPrefetchINTEL, _p1i64, )(const global ulong* ptr, uint num_bytes);
4838+
#endif // cl_intel_subgroups_buffer_prefetch
4839+
48334840
#define DECL_SUB_GROUP_BROADCAST(TYPE, TYPE_ABBR) \
48344841
DECL_SUB_GROUP_BROADCAST_BASE(TYPE, TYPE_ABBR) \
48354842
DECL_SUB_GROUP_BROADCAST_BASE(TYPE##2, v2##TYPE_ABBR) \

IGC/BiFModule/Implementation/IGCBiF_Intrinsics_Lsc.cl

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,31 @@ ulong __builtin_IB_lsc_atomic_cmpxchg_global_ulong(volatile __global ulong *bas
405405
ulong __builtin_IB_lsc_atomic_cmpxchg_local_ulong(volatile __local ulong *base, int immElemOff, ulong cmp, ulong val);
406406
#endif // cl_intel_pvc_lsc_validation
407407

408+
#ifdef cl_intel_subgroups_buffer_prefetch
409+
// 1D Block prefetches
// One builtin per element type and element count; each takes the global pointer and an
// LSC load cache option. The uchar and ushort families are declared for 1, 2, 4, 8 and
// 16 elements, while the uint and ulong families are declared only for 1, 2, 4 and 8.
410+
void __builtin_IB_lsc_simd_block_prefetch_uchar(const __global uchar*, enum LSC_LDCC cacheOpt);
411+
void __builtin_IB_lsc_simd_block_prefetch_uchar2(const __global uchar*, enum LSC_LDCC cacheOpt);
412+
void __builtin_IB_lsc_simd_block_prefetch_uchar4(const __global uchar*, enum LSC_LDCC cacheOpt);
413+
void __builtin_IB_lsc_simd_block_prefetch_uchar8(const __global uchar*, enum LSC_LDCC cacheOpt);
414+
void __builtin_IB_lsc_simd_block_prefetch_uchar16(const __global uchar*, enum LSC_LDCC cacheOpt);
415+
416+
void __builtin_IB_lsc_simd_block_prefetch_ushort(const __global ushort*, enum LSC_LDCC cacheOpt);
417+
void __builtin_IB_lsc_simd_block_prefetch_ushort2(const __global ushort*, enum LSC_LDCC cacheOpt);
418+
void __builtin_IB_lsc_simd_block_prefetch_ushort4(const __global ushort*, enum LSC_LDCC cacheOpt);
419+
void __builtin_IB_lsc_simd_block_prefetch_ushort8(const __global ushort*, enum LSC_LDCC cacheOpt);
420+
void __builtin_IB_lsc_simd_block_prefetch_ushort16(const __global ushort*, enum LSC_LDCC cacheOpt);
421+
422+
void __builtin_IB_lsc_simd_block_prefetch_uint(const __global uint*, enum LSC_LDCC cacheOpt);
423+
void __builtin_IB_lsc_simd_block_prefetch_uint2(const __global uint*, enum LSC_LDCC cacheOpt);
424+
void __builtin_IB_lsc_simd_block_prefetch_uint4(const __global uint*, enum LSC_LDCC cacheOpt);
425+
void __builtin_IB_lsc_simd_block_prefetch_uint8(const __global uint*, enum LSC_LDCC cacheOpt);
426+
427+
void __builtin_IB_lsc_simd_block_prefetch_ulong(const __global ulong*, enum LSC_LDCC cacheOpt);
428+
void __builtin_IB_lsc_simd_block_prefetch_ulong2(const __global ulong*, enum LSC_LDCC cacheOpt);
429+
void __builtin_IB_lsc_simd_block_prefetch_ulong4(const __global ulong*, enum LSC_LDCC cacheOpt);
430+
void __builtin_IB_lsc_simd_block_prefetch_ulong8(const __global ulong*, enum LSC_LDCC cacheOpt);
431+
#endif // cl_intel_subgroups_buffer_prefetch
432+
408433
#ifdef cl_intel_subgroup_extended_block_read
409434
// 2d block read
410435
ushort2 __builtin_IB_subgroup_block_read_flat_u8_m1k32v2(long baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, int2 coord);

IGC/BiFModule/Implementation/group.cl

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1738,6 +1738,129 @@ DEF_INTEL_SUB_GROUP_BLOCK_READ_GLOBAL(long4, v4i64, long, i64, simd_block_read_4
17381738
DEF_INTEL_SUB_GROUP_BLOCK_READ_GLOBAL(long8, v8i64, long, i64, simd_block_read_8_global_l)
17391739
#endif // cl_intel_subgroups_long
17401740

1741+
#ifdef cl_intel_subgroups_buffer_prefetch
1742+
1743+
// Dispatches to the __builtin_IB_lsc_simd_block_prefetch_uchar* builtin selected by
// num_bytes (1, 2, 4, 8 or 16), passing ptr and cacheOpt through. Does nothing when the
// UseLSC flag is not set or when num_bytes is any other value.
void __internal_SubgroupBlockPrefetchINTEL_char_cache_controls(const global uchar* ptr, uint num_bytes, enum LSC_LDCC cacheOpt)
{
    if (BIF_FLAG_CTRL_GET(UseLSC))
    {
        switch (num_bytes)
        {
        case 1:
            __builtin_IB_lsc_simd_block_prefetch_uchar(ptr, cacheOpt);
            break;
        case 2:
            __builtin_IB_lsc_simd_block_prefetch_uchar2(ptr, cacheOpt);
            break;
        case 4:
            __builtin_IB_lsc_simd_block_prefetch_uchar4(ptr, cacheOpt);
            break;
        case 8:
            __builtin_IB_lsc_simd_block_prefetch_uchar8(ptr, cacheOpt);
            break;
        case 16:
            __builtin_IB_lsc_simd_block_prefetch_uchar16(ptr, cacheOpt);
            break;
        default:
            // Unsupported sizes are silently ignored.
            break;
        }
    }
}
1769+
1770+
// uchar overload of SubgroupBlockPrefetchINTEL: forwards to the cache-controls variant
// with LSC_LDCC_DEFAULT as the cache option.
void SPIRV_OVERLOADABLE SPIRV_BUILTIN(SubgroupBlockPrefetchINTEL, _p1i8, )(const global uchar* ptr, uint num_bytes)
1771+
{
1772+
__internal_SubgroupBlockPrefetchINTEL_char_cache_controls(ptr, num_bytes, LSC_LDCC_DEFAULT);
1773+
}
1774+
1775+
// Dispatches to the __builtin_IB_lsc_simd_block_prefetch_ushort* builtin selected by
// num_bytes (2, 4, 8, 16 or 32), passing ptr and cacheOpt through. Does nothing when the
// UseLSC flag is not set or when num_bytes is any other value.
void __internal_SubgroupBlockPrefetchINTEL_short_cache_controls(const global ushort* ptr, uint num_bytes, enum LSC_LDCC cacheOpt)
{
    if (BIF_FLAG_CTRL_GET(UseLSC))
    {
        switch (num_bytes)
        {
        case 2:
            __builtin_IB_lsc_simd_block_prefetch_ushort(ptr, cacheOpt);
            break;
        case 4:
            __builtin_IB_lsc_simd_block_prefetch_ushort2(ptr, cacheOpt);
            break;
        case 8:
            __builtin_IB_lsc_simd_block_prefetch_ushort4(ptr, cacheOpt);
            break;
        case 16:
            __builtin_IB_lsc_simd_block_prefetch_ushort8(ptr, cacheOpt);
            break;
        case 32:
            __builtin_IB_lsc_simd_block_prefetch_ushort16(ptr, cacheOpt);
            break;
        default:
            // Unsupported sizes are silently ignored.
            break;
        }
    }
}
1801+
1802+
// ushort overload of SubgroupBlockPrefetchINTEL: forwards to the cache-controls variant
// with LSC_LDCC_DEFAULT as the cache option.
void SPIRV_OVERLOADABLE SPIRV_BUILTIN(SubgroupBlockPrefetchINTEL, _p1i16, )(const global ushort* ptr, uint num_bytes)
1803+
{
1804+
__internal_SubgroupBlockPrefetchINTEL_short_cache_controls(ptr, num_bytes, LSC_LDCC_DEFAULT);
1805+
}
1806+
1807+
// Dispatches to the __builtin_IB_lsc_simd_block_prefetch_uint* builtin selected by
// num_bytes (4, 8, 16 or 32), passing ptr and cacheOpt through. Does nothing when the
// UseLSC flag is not set or when num_bytes is any other value.
void __internal_SubgroupBlockPrefetchINTEL_int_cache_controls(const global uint* ptr, uint num_bytes, enum LSC_LDCC cacheOpt)
{
    if (BIF_FLAG_CTRL_GET(UseLSC))
    {
        switch (num_bytes)
        {
        case 4:
            __builtin_IB_lsc_simd_block_prefetch_uint(ptr, cacheOpt);
            break;
        case 8:
            __builtin_IB_lsc_simd_block_prefetch_uint2(ptr, cacheOpt);
            break;
        case 16:
            __builtin_IB_lsc_simd_block_prefetch_uint4(ptr, cacheOpt);
            break;
        case 32:
            __builtin_IB_lsc_simd_block_prefetch_uint8(ptr, cacheOpt);
            break;
        default:
            // Unsupported sizes are silently ignored.
            break;
        }
    }
}
1829+
1830+
// uint overload of SubgroupBlockPrefetchINTEL: forwards to the cache-controls variant
// with LSC_LDCC_DEFAULT as the cache option.
void SPIRV_OVERLOADABLE SPIRV_BUILTIN(SubgroupBlockPrefetchINTEL, _p1i32, )(const global uint* ptr, uint num_bytes)
1831+
{
1832+
__internal_SubgroupBlockPrefetchINTEL_int_cache_controls(ptr, num_bytes, LSC_LDCC_DEFAULT);
1833+
}
1834+
1835+
// Dispatches to the __builtin_IB_lsc_simd_block_prefetch_ulong* builtin selected by
// num_bytes (8, 16, 32 or 64), passing ptr and cacheOpt through. Does nothing when the
// UseLSC flag is not set or when num_bytes is any other value.
void __internal_SubgroupBlockPrefetchINTEL_long_cache_controls(const global ulong* ptr, uint num_bytes, enum LSC_LDCC cacheOpt)
{
    if (BIF_FLAG_CTRL_GET(UseLSC))
    {
        switch (num_bytes)
        {
        case 8:
            __builtin_IB_lsc_simd_block_prefetch_ulong(ptr, cacheOpt);
            break;
        case 16:
            __builtin_IB_lsc_simd_block_prefetch_ulong2(ptr, cacheOpt);
            break;
        case 32:
            __builtin_IB_lsc_simd_block_prefetch_ulong4(ptr, cacheOpt);
            break;
        case 64:
            __builtin_IB_lsc_simd_block_prefetch_ulong8(ptr, cacheOpt);
            break;
        default:
            // Unsupported sizes are silently ignored.
            break;
        }
    }
}
1857+
1858+
// ulong overload of SubgroupBlockPrefetchINTEL: forwards to the cache-controls variant
// with LSC_LDCC_DEFAULT as the cache option.
void SPIRV_OVERLOADABLE SPIRV_BUILTIN(SubgroupBlockPrefetchINTEL, _p1i64, )(const global ulong* ptr, uint num_bytes)
1859+
{
1860+
__internal_SubgroupBlockPrefetchINTEL_long_cache_controls(ptr, num_bytes, LSC_LDCC_DEFAULT);
1861+
}
1862+
#endif // cl_intel_subgroups_buffer_prefetch
1863+
17411864
#ifdef cl_intel_subgroup_local_block_io
17421865

17431866
#define DEF_INTEL_SUB_GROUP_BLOCK_READ_LOCAL(TYPE, TYPE_ABBR, ELEM_TYPE, ELEM_TYPE_ABBR, INTERNAL_FUNC) \

0 commit comments

Comments
 (0)