Skip to content

Commit 8e1ed2c

Browse files
iwwuigcbot
authored and committed
Fix the optimization EnableMergeTransposeSLM
1 parent 905c6be commit 8e1ed2c

File tree

2 files changed

+103
-90
lines changed

2 files changed

+103
-90
lines changed

IGC/Compiler/CISACodeGen/MemOpt.cpp

Lines changed: 102 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -571,7 +571,7 @@ bool MemOpt::mergeLoad(LoadInst* LeadingLoad,
571571
// list.
572572

573573
// Two edges of the region where loads are merged into.
574-
int64_t HighestOffset = LdSize, HighestOffset4NonSeq = 0;
574+
int64_t HighestOffset = LdSize, HighestOffset4Transpose = 0;
575575
int64_t LowestOffset = 0;
576576

577577
bool bCheckNext = true;
@@ -651,7 +651,7 @@ bool MemOpt::mergeLoad(LoadInst* LeadingLoad,
651651

652652
unsigned NextLoadSize = unsigned(DL->getTypeStoreSize(NextLoadType));
653653

654-
bool enableNonSeqMerge = false;
654+
bool enableTransposeSLM = false;
655655
uint32_t LeadInt2PtrOffset = 0;
656656

657657
// detect if we can merge non-sequential SLM loads
@@ -665,46 +665,41 @@ bool MemOpt::mergeLoad(LoadInst* LeadingLoad,
665665
// %1190 = load float, float addrspace(3) * %1185, align 4
666666
// %1191 = load float, float addrspace(3) * %1187, align 4
667667
// %1192 = load float, float addrspace(3) * %1189, align 4
668-
if (IGC_IS_FLAG_ENABLED(MergeSLMLoad)) {
668+
if (IGC_IS_FLAG_ENABLED(EnableMergeTransposeSLM)) {
669669
unsigned int resourceIndex = 0;
670670
bool direct = false;
671671
BufferType bufType = IGC::DecodeAS4GFXResource(
672672
LeadingLoad->getPointerAddressSpace(), direct, resourceIndex);
673673

674674
// initialize for first check in the loop
675-
if (!HighestOffset4NonSeq)
676-
HighestOffset4NonSeq = NextLoadSize * ArrayElem;
675+
if (!HighestOffset4Transpose)
676+
HighestOffset4Transpose = NextLoadSize * ArrayElem;
677677

678678
if (SLM == bufType &&
679-
(Off > ArrayElem) && !(Off % ArrayElem) &&
680-
LeadingLoad->getPointerAddressSpace() == NextLoad->getPointerAddressSpace()) {
681-
679+
(Off > ArrayElem) && !(Off % ArrayElem))
680+
{
682681
GetElementPtrInst* LeadGEP = dyn_cast<GetElementPtrInst>(LeadingLoad->getOperand(0));
683682
GetElementPtrInst* NextGEP = dyn_cast<GetElementPtrInst>(NextLoad->getOperand(0));
684683

685-
if (!LeadGEP || !NextGEP)
686-
continue;
687-
688-
if ((LeadGEP->getOperand(1) != NextGEP->getOperand(1)) ||
689-
(LeadGEP->getOperand(2) != NextGEP->getOperand(2)))
690-
continue;
691-
692-
if (!isa<IntToPtrInst>(LeadGEP->getPointerOperand()))
693-
continue;
694-
695-
if (IntToPtrInst* Int2Ptr =
696-
dyn_cast<IntToPtrInst>(LeadGEP->getPointerOperand())) {
697-
684+
if (LeadGEP && NextGEP &&
685+
(LeadGEP->getOperand(1) == NextGEP->getOperand(1)) &&
686+
(LeadGEP->getOperand(2) == NextGEP->getOperand(2)) &&
687+
isa<IntToPtrInst>(LeadGEP->getPointerOperand()))
688+
{
689+
IntToPtrInst* Int2Ptr = dyn_cast<IntToPtrInst>(LeadGEP->getPointerOperand());
698690
if (const ConstantInt* CI =
699-
dyn_cast<ConstantInt>(Int2Ptr->getOperand(0))) {
700-
if (CI->getType()->isIntegerTy()) {
691+
dyn_cast<ConstantInt>(Int2Ptr->getOperand(0)))
692+
{
693+
if (CI->getType()->isIntegerTy())
694+
{
701695
LeadInt2PtrOffset = (uint32_t)CI->getZExtValue();
702-
enableNonSeqMerge = true;
696+
enableTransposeSLM = true;
703697
}
704-
}
705-
} // if (IntToPtrInst* Int2Ptr
706-
} //if (SLM == bufType&& ..
707-
} // if (IGC_IS_FLAG_ENABLED(MergeSLMLoad)
698+
} // if (const ConstantInt* CI = ..
699+
} // if (LeadGEP && NextGEP ...
700+
701+
} //if (SLM == bufType && ..
702+
} // if (IGC_IS_FLAG_ENABLED(EnableMergeTransposeSLM)
708703

709704
// By assuming dead load elimination always works correctly, if the load on
710705
// the same location is observed again, that is probably because there is
@@ -716,23 +711,23 @@ bool MemOpt::mergeLoad(LoadInst* LeadingLoad,
716711
int64_t newLowestOffset;
717712
uint64_t newNumElts;
718713

719-
if (IGC_IS_FLAG_ENABLED(MergeSLMLoad) && enableNonSeqMerge) {
714+
if (IGC_IS_FLAG_ENABLED(EnableMergeTransposeSLM) && enableTransposeSLM) {
720715
if (!bCheckNext)
721716
continue;
722717

723-
newHighestOffset = std::max(Off + ArrayElem * NextLoadSize, HighestOffset4NonSeq);
718+
newHighestOffset = std::max(Off + ArrayElem * NextLoadSize, HighestOffset4Transpose);
724719
newLowestOffset = std::min(Off, LowestOffset);
725720
newNumElts = uint64_t((newHighestOffset - newLowestOffset) /
726721
LdScalarSize) / ArrayElem;
727722

728-
// Update HighestOffset4NonSeq for each iteration and
723+
// Update HighestOffset4Transpose for each iteration and
729724
// check against the next expected in the sequence
730725
// Example: Off = 576 when checking 'i32 5184' entry
731726
// (Offset 0) %1184 = inttoptr i32 4608 to [144 x float] addrspace(3)*
732727
// %1185 = getelementptr [144 x float], [144 x float] addrspace(3)* %1184, i32 0, i32 %1179
733728
// (Offset 576) %1186 = inttoptr i32 5184 to [144 x float] addrspace(3)*
734-
if (Off != HighestOffset4NonSeq) {
735-
bCheckNext = false; // abort enableNonSeqMerge checking
729+
if (Off != HighestOffset4Transpose) {
730+
bCheckNext = false; // abort enableTransposeSLM checking
736731
continue;
737732
}
738733

@@ -742,6 +737,10 @@ bool MemOpt::mergeLoad(LoadInst* LeadingLoad,
742737
// 'NumElts' to 2 as if the i32 wasn't present.
743738
if (uint64_t(newHighestOffset - newLowestOffset) % (LdScalarSize * ArrayElem) != 0)
744739
continue;
740+
741+
// Limit to 3 entries for merging
742+
if (newNumElts > 3)
743+
continue;
745744
}
746745
else {
747746
newHighestOffset = std::max(Off + NextLoadSize, HighestOffset);
@@ -755,14 +754,14 @@ bool MemOpt::mergeLoad(LoadInst* LeadingLoad,
755754
// 'NumElts' to 2 as if the i32 wasn't present.
756755
if (uint64_t(newHighestOffset - newLowestOffset) % LdScalarSize != 0)
757756
continue;
758-
}
759757

760-
// Bail out if the resulting vector load is already not profitable.
761-
if (newNumElts > profitVec[0])
762-
continue;
758+
// Bail out if the resulting vector load is already not profitable.
759+
if (newNumElts > profitVec[0])
760+
continue;
761+
}
763762

764-
if (enableNonSeqMerge) {
765-
HighestOffset4NonSeq = newHighestOffset;
763+
if (enableTransposeSLM) {
764+
HighestOffset4Transpose = newHighestOffset;
766765
}
767766
else {
768767
HighestOffset = newHighestOffset;
@@ -861,36 +860,40 @@ bool MemOpt::mergeLoad(LoadInst* LeadingLoad,
861860
}
862861

863862
uint32_t ArrayElem = 1;
864-
if (IGC_IS_FLAG_ENABLED(MergeSLMLoad)) {
865-
uint32_t newInt2PtrOffset = std::get<3>(LoadsToMerge.back());
863+
uint32_t newInt2PtrOffset = std::get<3>(LoadsToMerge.back());
866864

867-
// lookup 144
868-
// %55 = getelementptr [144 x float], [144 x float] addrspace(3)* %54, i32 0, i32 %36
869-
if (GetElementPtrInst* GEP =
870-
dyn_cast<GetElementPtrInst>(FirstLoad->getPointerOperand())) {
871-
Value* GEPptr = GEP->getPointerOperand();
872-
if (GEPptr->getType()->isPointerTy()) {
873-
Type* GEPElemType = GEPptr->getType()->getPointerElementType();
874-
if (GEPElemType->isArrayTy())
875-
ArrayElem = (uint32_t)GEPElemType->getArrayNumElements();
876-
}
865+
// lookup 144
866+
// %55 = getelementptr [144 x float], [144 x float] addrspace(3)* %54, i32 0, i32 %36
867+
if (GetElementPtrInst* GEP =
868+
dyn_cast<GetElementPtrInst>(FirstLoad->getPointerOperand())) {
869+
Value* GEPptr = GEP->getPointerOperand();
870+
if (GEPptr->getType()->isPointerTy()) {
871+
Type* GEPElemType = GEPptr->getType()->getPointerElementType();
872+
if (GEPElemType->isArrayTy())
873+
ArrayElem = (uint32_t)GEPElemType->getArrayNumElements();
877874
}
878-
if (newInt2PtrOffset && ArrayElem) {
879-
Type* newArrayType = PointerType::get(
880-
ArrayType::get(LeadingLoadScalarType, ArrayElem * NumElts),
881-
LeadingLoad->getPointerAddressSpace());
882-
883-
Value* NewInt2Ptr = Builder.getInt32(std::get<3>(LoadsToMerge.back()));
875+
}
876+
// If newInt2PtrOffset is non-zero, that means enableTransposeSLM is set
877+
// Prepare new instructions
878+
if (newInt2PtrOffset && ArrayElem) {
879+
Type* newArrayType = PointerType::get(
880+
ArrayType::get(LeadingLoadScalarType, ArrayElem * NumElts),
881+
LeadingLoad->getPointerAddressSpace());
884882

885-
NewInt2Ptr = createBitOrPointerCast(NewInt2Ptr, newArrayType, Builder);
886-
GetElementPtrInst* LeadGEP =
887-
dyn_cast<GetElementPtrInst>(FirstLoad->getPointerOperand());
883+
Value* NewInt2Ptr = Builder.getInt32(std::get<3>(LoadsToMerge.back()));
888884

889-
Value* GEPArg[] = { LeadGEP->getOperand(1), LeadGEP->getOperand(2) };
885+
NewInt2Ptr = createBitOrPointerCast(NewInt2Ptr, newArrayType, Builder);
886+
GetElementPtrInst* LeadGEP =
887+
dyn_cast<GetElementPtrInst>(FirstLoad->getPointerOperand());
890888

891-
Ptr = Builder.CreateGEP(NewInt2Ptr, GEPArg);
892-
} // if (newInt2PtrOffset && ArrayElem)
893-
} // if (IGC_IS_FLAG_ENABLED(MergeSLMLoad)
889+
// We need to adjust the buffer index by multiplying it by 3
890+
// From x0, x1, ... y0, y1, ... z0, z1, ...
891+
// To x0, y0, z0, x1, y1, z1, ....
892+
Value* NewGEPOffset =
893+
Builder.CreateMul(LeadGEP->getOperand(2), Builder.getInt32(3));
894+
Value* GEPArg[] = { LeadGEP->getOperand(1), NewGEPOffset };
895+
Ptr = Builder.CreateGEP(NewInt2Ptr, GEPArg);
896+
} // if (newInt2PtrOffset && ArrayElem)
894897

895898
Type* NewLoadType = IGCLLVM::FixedVectorType::get(LeadingLoadScalarType, NumElts);
896899
Type* NewPointerType =
@@ -919,7 +922,7 @@ bool MemOpt::mergeLoad(LoadInst* LeadingLoad,
919922
allInvariantLoads = false;
920923
}
921924

922-
if (IGC_IS_FLAG_ENABLED(MergeSLMLoad) && newInt2PtrOffset)
925+
if (IGC_IS_FLAG_ENABLED(EnableMergeTransposeSLM) && newInt2PtrOffset)
923926
Pos = unsigned((std::get<1>(I) - FirstOffset) / LdScalarSize / ArrayElem);
924927
else
925928
Pos = unsigned((std::get<1>(I) - FirstOffset) / LdScalarSize);
@@ -1041,7 +1044,7 @@ bool MemOpt::mergeStore(StoreInst* LeadingStore,
10411044
// be merged into the "previous" tailing store.
10421045

10431046
// Two edges of the region where stores are merged into.
1044-
int64_t LastToLeading = StSize, LastToLeading4NonSeq = 0;
1047+
int64_t LastToLeading = StSize, LastToLeading4Transpose = 0;
10451048
int64_t LeadingToFirst = 0;
10461049

10471050
// List of instructions need dependency check.
@@ -1119,8 +1122,8 @@ bool MemOpt::mergeStore(StoreInst* LeadingStore,
11191122

11201123
unsigned NextStoreSize = unsigned(DL->getTypeStoreSize(NextStoreType));
11211124

1122-
bool enableNonSeqMerge = false;
1123-
// LeadInt2PtrOffset is non-zero for enableNonSeqMerge case so we can re-create
1125+
bool enableTransposeSLM = false;
1126+
// LeadInt2PtrOffset is non-zero for enableTransposeSLM case so we can re-create
11241127
// new inttoptr inst.
11251128
uint32_t LeadInt2PtrOffset = 0;
11261129

@@ -1135,19 +1138,17 @@ bool MemOpt::mergeStore(StoreInst* LeadingStore,
11351138
// store float %51, float addrspace(3)* %55, align 4
11361139
// store float %52, float addrspace(3)* %57, align 4
11371140
// store float %53, float addrspace(3)* %59, align 4
1138-
if (IGC_IS_FLAG_ENABLED(MergeSLMStore)) {
1141+
if (IGC_IS_FLAG_ENABLED(EnableMergeTransposeSLM)) {
11391142
unsigned int resourceIndex = 0;
11401143
bool direct = false;
11411144
BufferType bufType = IGC::DecodeAS4GFXResource(
11421145
LeadingStore->getPointerAddressSpace(), direct, resourceIndex);
1143-
if (SLM == bufType &&
1144-
(abs(Off) > ArrayElem) &&
1145-
LeadingStore->getPointerAddressSpace() == NextStore->getPointerAddressSpace()) {
1146+
if (SLM == bufType && (abs(Off) > ArrayElem)) {
11461147

1147-
if ((Off > 0 && Off != LastToLeading4NonSeq + ArrayElem * NextStoreSize) ||
1148+
if ((Off > 0 && Off != LastToLeading4Transpose + ArrayElem * NextStoreSize) ||
11481149
(Off < 0 && (-Off) != (LeadingToFirst + ArrayElem * NextStoreSize)))
11491150
continue;
1150-
else {
1151+
else { // check if it matches the pattern for enableTransposeSLM
11511152
GetElementPtrInst* LeadGEP = dyn_cast<GetElementPtrInst>(LeadingStore->getOperand(1));
11521153
GetElementPtrInst* NextGEP = dyn_cast<GetElementPtrInst>(NextStore->getOperand(1));
11531154
if (!LeadGEP || !NextGEP)
@@ -1166,21 +1167,31 @@ bool MemOpt::mergeStore(StoreInst* LeadingStore,
11661167
dyn_cast<ConstantInt>(Int2Ptr->getOperand(0))) {
11671168
if (CI->getType()->isIntegerTy()) {
11681169
LeadInt2PtrOffset = (uint32_t)CI->getZExtValue();
1169-
enableNonSeqMerge = true;
1170+
enableTransposeSLM = true;
11701171
}
11711172
}
1172-
}
1173-
}
1173+
} // if (IntToPtrInst* Int2Ptr
1174+
1175+
if (!enableTransposeSLM)
1176+
continue;
1177+
1178+
NumElts += getNumElements(NextStoreType);
1179+
1180+
// Limit to 3 entries for merging
1181+
if (NumElts > 3)
1182+
break;
1183+
} // else { // check if it matches the pattern for enableTransposeSLM
11741184
} // if (SLM == bufType && ...
11751185
} else if ((Off > 0 && Off != LastToLeading) ||
11761186
(Off < 0 && (-Off) != (LeadingToFirst + NextStoreSize)))
11771187
// Check it's consecutive to the current stores to be merged.
11781188
continue;
1179-
1180-
NumElts += getNumElements(NextStoreType);
1181-
// Bail out if the resulting vector store is already not profitable.
1182-
if (NumElts > profitVec[0])
1183-
break;
1189+
else {
1190+
NumElts += getNumElements(NextStoreType);
1191+
// Bail out if the resulting vector store is already not profitable.
1192+
if (NumElts > profitVec[0])
1193+
break;
1194+
}
11841195

11851196
// This store is to be merged. Remove it from check list.
11861197
CheckList.pop_back();
@@ -1197,7 +1208,7 @@ bool MemOpt::mergeStore(StoreInst* LeadingStore,
11971208

11981209
if (Off > 0) {
11991210
LastToLeading = Off + NextStoreSize;
1200-
LastToLeading4NonSeq = Off;
1211+
LastToLeading4Transpose = Off;
12011212
}
12021213
else
12031214
LeadingToFirst = (-Off);
@@ -1279,7 +1290,7 @@ bool MemOpt::mergeStore(StoreInst* LeadingStore,
12791290
}
12801291
}
12811292

1282-
// If newInt2PtrOffset is non-zero, that means enableNonSeqMerge is set
1293+
// If newInt2PtrOffset is non-zero, that means enableTransposeSLM is set
12831294
// Prepare new instructions
12841295
if (newInt2PtrOffset && ArrayElem) {
12851296
Type* newArrayType = PointerType::get(
@@ -1291,10 +1302,13 @@ bool MemOpt::mergeStore(StoreInst* LeadingStore,
12911302
GetElementPtrInst* LeadGEP =
12921303
dyn_cast<GetElementPtrInst>(FirstStore->getOperand(1));
12931304

1294-
Value* GEPArg[] = { LeadGEP->getOperand(1), LeadGEP->getOperand(2) };
1295-
1296-
BitCastPtr = Builder.CreateGEP(
1297-
NewInt2Ptr, GEPArg);
1305+
// We need to adjust the buffer index by multiplying it by 3
1306+
// From x0, x1, ... y0, y1, ... z0, z1, ...
1307+
// To x0, y0, z0, x1, y1, z1, ....
1308+
Value* NewGEPOffset =
1309+
Builder.CreateMul(LeadGEP->getOperand(2), Builder.getInt32(3));
1310+
Value* GEPArg[] = { LeadGEP->getOperand(1), NewGEPOffset };
1311+
BitCastPtr = Builder.CreateGEP(NewInt2Ptr, GEPArg);
12981312
}
12991313

13001314
for (auto& I : StoresToMerge) {

IGC/common/igc_flags.def

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,8 +151,7 @@ DECLARE_IGC_REGKEY(bool, EnableBlendToDiscard, true, "Enable blend to
151151
DECLARE_IGC_REGKEY(bool, EnableBlendToFill, true, "Enable blend to fill based on blend state.", false)
152152
DECLARE_IGC_REGKEY(bool, UseTiledCSThreadOrder, true, "Use 4x4 disaptch for CS order when it seems beneficial", false)
153153
DECLARE_IGC_REGKEY(bool, EnableWaveForce32, false, "Force Wave to use simd32", false)
154-
DECLARE_IGC_REGKEY(bool, MergeSLMLoad, false, "Merge SLM Load opt", false)
155-
DECLARE_IGC_REGKEY(bool, MergeSLMStore, false, "Merge SLM Store opt", false)
154+
DECLARE_IGC_REGKEY(bool, EnableMergeTransposeSLM, false, "Transpose SLM float3 storage from 3 separate x,y,z buffers to 1 big buffer with xyz consecutively", false)
156155
DECLARE_IGC_REGKEY(bool, EnableSLMConstProp, true, "Enable SLM constant propagation (compute shader only).", false)
157156
DECLARE_IGC_REGKEY(bool, EnableStatelessToStatefull, true, "Enable Stateless To Statefull transformation for global and constant address space in OpenCL kernels", false)
158157
DECLARE_IGC_REGKEY(bool, EnableStatefulToken, true, "Enable generating patch token to indicate a ptr argument is fully converted to stateful (temporary)", false)

0 commit comments

Comments
 (0)