Skip to content

Commit a068489

Browse files
mbelickipszymich
authored and committed
Using explicit subgroup operations in JointMatrix loads/stores.
This patch changes the implementation of JointMatrix loads and stores to use explicit subgroup operations. This prevents some optimizations from being missed by the load/store merge optimization.
1 parent df974fb commit a068489

File tree

9 files changed

+237
-131
lines changed

9 files changed

+237
-131
lines changed

IGC/BiFModule/Languages/OpenCL/PreRelease/IBiF_matrix.cl

Lines changed: 159 additions & 104 deletions
Large diffs are not rendered by default.

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8082,6 +8082,7 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
80828082
case GenISAIntrinsic::GenISA_LSCAtomicFP32:
80838083
case GenISAIntrinsic::GenISA_LSCAtomicInts:
80848084
case GenISAIntrinsic::GenISA_LSC2DBlockRead:
8085+
case GenISAIntrinsic::GenISA_LSC2DBlockWrite:
80858086
emitLSCIntrinsic(inst);
80868087
break;
80878088
case GenISAIntrinsic::GenISA_dummyInst:
@@ -20220,8 +20221,10 @@ void EmitPass::emitLSCStore(
2022020221
resource, addr_size, data_order, immOffset, cacheOpts);
2022120222
}
2022220223

20223-
void EmitPass::emitLSC2DBlockRead(llvm::GenIntrinsicInst* inst)
20224+
void EmitPass::emitLSC2DBlockOperation(llvm::GenIntrinsicInst* inst)
2022420225
{
20226+
const bool isRead = inst->getIntrinsicID() == GenISAIntrinsic::GenISA_LSC2DBlockRead;
20227+
2022520228
CVariable* pFlatImageBaseoffset = GetSymbol(inst->getOperand(0));
2022620229
CVariable* pFlatImageWidth = GetSymbol(inst->getOperand(1));
2022720230
CVariable* pFlatImageHeight = GetSymbol(inst->getOperand(2));
@@ -20253,8 +20256,13 @@ void EmitPass::emitLSC2DBlockRead(llvm::GenIntrinsicInst* inst)
2025320256
CName::NONE);
2025420257
}
2025520258

20259+
if (isRead == false)
20260+
{
20261+
destination = GetSymbol(inst->getOperand(12));
20262+
}
20263+
2025620264
m_encoder->LSC_2DBlockMessage(
20257-
LSC_LOAD_BLOCK2D,
20265+
isRead ? LSC_LOAD_BLOCK2D : LSC_STORE_BLOCK2D,
2025820266
nullptr,
2025920267
destination,
2026020268
nullptr, //pImgBTI - not needed for read
@@ -20272,7 +20280,7 @@ void EmitPass::emitLSC2DBlockRead(llvm::GenIntrinsicInst* inst)
2027220280
pFlatImagePitch);
2027320281
m_encoder->Push();
2027420282

20275-
if (destination != m_destination)
20283+
if (isRead && destination != m_destination)
2027620284
{
2027720285
// m1 v2 block read
2027820286
m_encoder->Copy(m_destination, destination);
@@ -20426,7 +20434,8 @@ void EmitPass::emitLSCIntrinsic(llvm::GenIntrinsicInst* GII)
2042620434
emitLSCFence(GII);
2042720435
break;
2042820436
case GenISAIntrinsic::GenISA_LSC2DBlockRead:
20429-
emitLSC2DBlockRead(GII);
20437+
case GenISAIntrinsic::GenISA_LSC2DBlockWrite:
20438+
emitLSC2DBlockOperation(GII);
2043020439
break;
2043120440
default:
2043220441
if (isLSCAtomic(iid)) { ////// GenISA_LSCAtomic*

IGC/Compiler/CISACodeGen/EmitVISAPass.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,7 @@ class EmitPass : public llvm::FunctionPass
509509
void emitLscIntrinsicStore(llvm::GenIntrinsicInst* GII);
510510

511511
void emitLSCFence(llvm::GenIntrinsicInst* inst);
512-
void emitLSC2DBlockRead(llvm::GenIntrinsicInst* inst);
512+
void emitLSC2DBlockOperation(llvm::GenIntrinsicInst* inst);
513513
void emitLSCAtomic(llvm::GenIntrinsicInst* inst);
514514
void emitLSCIntrinsic(llvm::GenIntrinsicInst* GII);
515515
void emitLSCLoad(

IGC/Compiler/Optimizer/OpenCLPasses/JointMatrixFuncsResolutionPass.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -268,9 +268,9 @@ std::string JointMatrixFuncsResolutionPass::GetLoadStoreMatrixFuncName
268268
}
269269

270270
/* On PVC due to SIMD16 different SIMD lane contribution is used for matrix A.
271-
* Therefore different load function is required. */
272-
if (m_Ctx->platform.hasExecSize16DPAS()
273-
&& (matrixLayout == LayoutPackedA || matrixLayout == LayoutRowMajor)) {
271+
* Additionally we use block 2d operations on PVC, so it's easier to
272+
* implement SG16 loads and stores as separate builtins. */
273+
if (m_Ctx->platform.hasExecSize16DPAS()) {
274274
name += "SG16_";
275275
}
276276

IGC/Compiler/Optimizer/OpenCLPasses/SubGroupFuncs/SubGroupFuncsResolution.cpp

Lines changed: 39 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,8 @@ const std::array<std::pair<std::string, WaveOps>, 13> SubGroupFuncsResolution::m
194194
};
195195

196196

197-
const llvm::StringRef SubGroupFuncsResolution::SUBGROUP_BLOCK_READ = "__builtin_IB_subgroup_block_read_flat";
197+
const llvm::StringRef SubGroupFuncsResolution::SUBGROUP_BLOCK_READ = "__builtin_IB_subgroup_block_read_flat";
198+
const llvm::StringRef SubGroupFuncsResolution::SUBGROUP_BLOCK_WRITE = "__builtin_IB_subgroup_block_write_flat";
198199

199200
SubGroupFuncsResolution::SubGroupFuncsResolution(void) : FunctionPass(ID)
200201
{
@@ -927,7 +928,11 @@ void SubGroupFuncsResolution::visitCallInst(CallInst& CI)
927928
}
928929
else if (funcName.consume_front(SubGroupFuncsResolution::SUBGROUP_BLOCK_READ))
929930
{
930-
subGroup2DBlockRead(CI, funcName);
931+
subGroup2DBlockOperation(CI, funcName, true);
932+
}
933+
else if (funcName.consume_front(SubGroupFuncsResolution::SUBGROUP_BLOCK_WRITE))
934+
{
935+
subGroup2DBlockOperation(CI, funcName, false);
931936
}
932937
else
933938
{
@@ -937,7 +942,7 @@ void SubGroupFuncsResolution::visitCallInst(CallInst& CI)
937942
m_changed = true;
938943
}
939944

940-
void SubGroupFuncsResolution::subGroup2DBlockRead(llvm::CallInst& CI, llvm::StringRef funcName)
945+
void SubGroupFuncsResolution::subGroup2DBlockOperation(llvm::CallInst& CI, llvm::StringRef funcName, bool isRead)
941946
{
942947
IGC::IGCMD::MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
943948
IGC::IGCMD::FunctionInfoMetaDataHandle funcInfoMD = pMdUtils->getFunctionsInfoItem(CI.getParent()->getParent());
@@ -998,11 +1003,15 @@ void SubGroupFuncsResolution::subGroup2DBlockRead(llvm::CallInst& CI, llvm::Stri
9981003
}
9991004
else
10001005
{
1001-
IGC_ASSERT_MESSAGE(0, "Unrecognized m element in __builtin_IB_subgroup_block_read_flat.");
1006+
IGC_ASSERT_MESSAGE(0, "Unrecognized m element in __builtin_IB_subgroup_block_read/write_flat.");
10021007
return;
10031008
}
10041009

1005-
if (funcName.consume_front("k16"))
1010+
if (funcName.consume_front("k8"))
1011+
{
1012+
tileWidth = 8;
1013+
}
1014+
else if (funcName.consume_front("k16"))
10061015
{
10071016
tileWidth = 16;
10081017
}
@@ -1012,11 +1021,18 @@ void SubGroupFuncsResolution::subGroup2DBlockRead(llvm::CallInst& CI, llvm::Stri
10121021
}
10131022
else
10141023
{
1015-
IGC_ASSERT_MESSAGE(0, "Unrecognized k element in __builtin_IB_subgroup_block_read_flat.");
1024+
IGC_ASSERT_MESSAGE(0, "Unrecognized k element in __builtin_IB_subgroup_block_read/write_flat.");
10161025
return;
10171026
}
10181027

1019-
IGC_ASSERT_MESSAGE(funcName.consume_front("v2"), "Unrecognized v element in __builtin_IB_subgroup_block_read_flat.");
1028+
if (funcName.consume_front("v1"))
1029+
{
1030+
numBlocksV = 1;
1031+
}
1032+
else
1033+
{
1034+
IGC_ASSERT_MESSAGE(funcName.consume_front("v2"), "Unrecognized v element in __builtin_IB_subgroup_block_read/write_flat.");
1035+
}
10201036
}
10211037
else if (isTranspose && !isVnniTransform)
10221038
{
@@ -1112,17 +1128,24 @@ void SubGroupFuncsResolution::subGroup2DBlockRead(llvm::CallInst& CI, llvm::Stri
11121128
args.push_back(isTransposeConstant);
11131129
args.push_back(isVnniTransformConstant);
11141130

1131+
Function* BlockFunc = nullptr;
1132+
if (isRead) {
1133+
BlockFunc = GenISAIntrinsic::getDeclaration(
1134+
CI.getCalledFunction()->getParent(),
1135+
GenISAIntrinsic::GenISA_LSC2DBlockRead,
1136+
CI.getCalledFunction()->getReturnType());
1137+
} else {
1138+
args.push_back(CI.getArgOperand(5));
1139+
BlockFunc = GenISAIntrinsic::getDeclaration(
1140+
CI.getCalledFunction()->getParent(),
1141+
GenISAIntrinsic::GenISA_LSC2DBlockWrite,
1142+
CI.getCalledFunction()->getReturnType());
1143+
}
11151144

1116-
Function* Block2DReadFunc = GenISAIntrinsic::getDeclaration(
1117-
CI.getCalledFunction()->getParent(),
1118-
GenISAIntrinsic::GenISA_LSC2DBlockRead,
1119-
CI.getCalledFunction()->getReturnType());
1120-
1121-
auto* BlockRead = cast<GenIntrinsicInst>(
1122-
CallInst::Create(Block2DReadFunc, args, "", &CI));
1123-
BlockRead->setDebugLoc(CI.getDebugLoc());
1145+
auto* BlockOp = cast<GenIntrinsicInst>(CallInst::Create(BlockFunc, args, "", &CI));
1146+
BlockOp->setDebugLoc(CI.getDebugLoc());
11241147

1125-
CI.replaceAllUsesWith(BlockRead);
1148+
CI.replaceAllUsesWith(BlockOp);
11261149
CI.eraseFromParent();
11271150
}
11281151

IGC/Compiler/Optimizer/OpenCLPasses/SubGroupFuncs/SubGroupFuncsResolution.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ namespace IGC
6868
void CheckMediaBlockInstError(llvm::GenIntrinsicInst* inst, bool isRead);
6969

7070
void subGroupArithmetic(llvm::CallInst& CI, WaveOps op, GroupOpType groupType);
71-
void subGroup2DBlockRead(llvm::CallInst& CI, llvm::StringRef funcName);
71+
void subGroup2DBlockOperation(llvm::CallInst& CI, llvm::StringRef funcName, bool isRead);
7272

7373
static const llvm::StringRef SUB_GROUP_BARRIER;
7474
static const llvm::StringRef GET_MAX_SUB_GROUP_SIZE;
@@ -209,6 +209,7 @@ namespace IGC
209209
static const llvm::StringRef SUB_GROUP_CLUSTERED_REDUCE;
210210

211211
static const llvm::StringRef SUBGROUP_BLOCK_READ;
212+
static const llvm::StringRef SUBGROUP_BLOCK_WRITE;
212213

213214
private:
214215
/// @brief Container for instructions to be deleted after visiting a function.

IGC/GenISAIntrinsics/Intrinsic_definitions.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2186,6 +2186,24 @@
21862186
"only and elemSize 32)")],
21872187
"None"]],
21882188
####################################################################################################
2189+
"GenISA_LSC2DBlockWrite": ["LSC 2d block write",
2190+
[("void", "nothing is returned"),
2191+
[("long", "flat image base offset"),
2192+
("int", "flat image base width"),
2193+
("int", "flat image base height"),
2194+
("int", "flat image base pitch"),
2195+
("int", "offset x"),
2196+
("int", "offset y"),
2197+
("int", "elemSize"),
2198+
("int", "tile width"),
2199+
("int", "tile height"),
2200+
("int", "V - num blocks (2 for simple 2d block read)"),
2201+
("bool", "transpose"),
2202+
("bool", "vnni transform (for transpose+transform use transpose "+\
2203+
"only and elemSize 32)"),
2204+
("anyint", "stored value")],
2205+
"None"]],
2206+
####################################################################################################
21892207
"GenISA_LSCAtomicFP32": ["LSC atomic FP32 add,sub,min,max,fcas",
21902208
[("float", "return old value"),
21912209
[("anyptr", "memory pointer: ugm, ugml, tgm, slm"),

IGC/VectorCompiler/lib/GenXOpts/CMTrans/GenXImportOCLBiF.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -453,7 +453,7 @@ static void InitializeBIFlags(Module &M) {
453453
// FIXME: target specific, but subtarget cannot be reached in middle-end.
454454
initializeVarWithValue("__HasInt64SLMAtomicCAS", 0);
455455

456-
initializeVarWithValue("__JointMatrixLoadStoreOpt", 2);
456+
initializeVarWithValue("__JointMatrixLoadStoreOpt", 3);
457457
}
458458

459459
static bool isOCLBuiltinDecl(const Function &F) {

IGC/common/igc_flags.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -691,7 +691,7 @@ DECLARE_IGC_REGKEY(bool, EnableLocalIdCalculationInShader, false,
691691
"Enables calcualtion of local thread IDs in shader. Valid only in compute"\
692692
"shaders on XeHP+. IDs are calculated only if HW generated IDs cannot be"\
693693
"used.", true)
694-
DECLARE_IGC_REGKEY(int, JointMatrixLoadStoreOpt, 2, "Selects scalar (0) or vector (1) or vector optimized (2) implementation of Joint Matrix Load/Store built-ins", true)
694+
DECLARE_IGC_REGKEY(int, JointMatrixLoadStoreOpt, 3, "Selects subgroup (0), or block read/write (1), or optimized block read/write (2), 2d block read/write (3) implementation of Joint Matrix Load/Store built-ins", true)
695695
DECLARE_IGC_REGKEY(bool, EnableVector8LoadStore, false, "Enable Vectorizer to generate 8x32i and 4x64i loads and stores", true)
696696
DECLARE_IGC_REGKEY(bool, EnableZEBinary, true, "Force-enable output in ZE binary format. Leave unset for compiler to choose based on current platform's support for ZE binary", true)
697697
DECLARE_IGC_REGKEY(bool, ExcludeIRFromZEBinary, false, "Exclude IR sections from ZE binary", true)

0 commit comments

Comments
 (0)