@@ -571,7 +571,7 @@ bool MemOpt::mergeLoad(LoadInst* LeadingLoad,
571
571
// list.
572
572
573
573
// Two edges of the region where loads are merged into.
574
- int64_t HighestOffset = LdSize, HighestOffset4NonSeq = 0 ;
574
+ int64_t HighestOffset = LdSize, HighestOffset4Transpose = 0 ;
575
575
int64_t LowestOffset = 0 ;
576
576
577
577
bool bCheckNext = true ;
@@ -651,7 +651,7 @@ bool MemOpt::mergeLoad(LoadInst* LeadingLoad,
651
651
652
652
unsigned NextLoadSize = unsigned (DL->getTypeStoreSize (NextLoadType));
653
653
654
- bool enableNonSeqMerge = false ;
654
+ bool enableTransposeSLM = false ;
655
655
uint32_t LeadInt2PtrOffset = 0 ;
656
656
657
657
// detect if we can merge non-sequential SLM loads
@@ -665,46 +665,41 @@ bool MemOpt::mergeLoad(LoadInst* LeadingLoad,
665
665
// %1190 = load float, float addrspace(3) * %1185, align 4
666
666
// %1191 = load float, float addrspace(3) * %1187, align 4
667
667
// %1192 = load float, float addrspace(3) * %1189, align 4
668
- if (IGC_IS_FLAG_ENABLED (MergeSLMLoad )) {
668
+ if (IGC_IS_FLAG_ENABLED (EnableMergeTransposeSLM )) {
669
669
unsigned int resourceIndex = 0 ;
670
670
bool direct = false ;
671
671
BufferType bufType = IGC::DecodeAS4GFXResource (
672
672
LeadingLoad->getPointerAddressSpace (), direct, resourceIndex);
673
673
674
674
// initialize for first check in the loop
675
- if (!HighestOffset4NonSeq )
676
- HighestOffset4NonSeq = NextLoadSize * ArrayElem;
675
+ if (!HighestOffset4Transpose )
676
+ HighestOffset4Transpose = NextLoadSize * ArrayElem;
677
677
678
678
if (SLM == bufType &&
679
- (Off > ArrayElem) && !(Off % ArrayElem) &&
680
- LeadingLoad->getPointerAddressSpace () == NextLoad->getPointerAddressSpace ()) {
681
-
679
+ (Off > ArrayElem) && !(Off % ArrayElem))
680
+ {
682
681
GetElementPtrInst* LeadGEP = dyn_cast<GetElementPtrInst>(LeadingLoad->getOperand (0 ));
683
682
GetElementPtrInst* NextGEP = dyn_cast<GetElementPtrInst>(NextLoad->getOperand (0 ));
684
683
685
- if (!LeadGEP || !NextGEP)
686
- continue ;
687
-
688
- if ((LeadGEP->getOperand (1 ) != NextGEP->getOperand (1 )) ||
689
- (LeadGEP->getOperand (2 ) != NextGEP->getOperand (2 )))
690
- continue ;
691
-
692
- if (!isa<IntToPtrInst>(LeadGEP->getPointerOperand ()))
693
- continue ;
694
-
695
- if (IntToPtrInst* Int2Ptr =
696
- dyn_cast<IntToPtrInst>(LeadGEP->getPointerOperand ())) {
697
-
684
+ if (LeadGEP && NextGEP &&
685
+ (LeadGEP->getOperand (1 ) == NextGEP->getOperand (1 )) &&
686
+ (LeadGEP->getOperand (2 ) == NextGEP->getOperand (2 )) &&
687
+ isa<IntToPtrInst>(LeadGEP->getPointerOperand ()))
688
+ {
689
+ IntToPtrInst* Int2Ptr = dyn_cast<IntToPtrInst>(LeadGEP->getPointerOperand ());
698
690
if (const ConstantInt* CI =
699
- dyn_cast<ConstantInt>(Int2Ptr->getOperand (0 ))) {
700
- if (CI->getType ()->isIntegerTy ()) {
691
+ dyn_cast<ConstantInt>(Int2Ptr->getOperand (0 )))
692
+ {
693
+ if (CI->getType ()->isIntegerTy ())
694
+ {
701
695
LeadInt2PtrOffset = (uint32_t )CI->getZExtValue ();
702
- enableNonSeqMerge = true ;
696
+ enableTransposeSLM = true ;
703
697
}
704
- }
705
- } // if (IntToPtrInst* Int2Ptr
706
- } // if (SLM == bufType&& ..
707
- } // if (IGC_IS_FLAG_ENABLED(MergeSLMLoad)
698
+ } // if (const ConstantInt* CI = ..
699
+ } // if (LeadGEP && NextGEP ...
700
+
701
+ } // if (SLM == bufType && ..
702
+ } // if (IGC_IS_FLAG_ENABLED(EnableMergeTransposeSLM)
708
703
709
704
// By assuming dead load elimination always works correctly, if the load on
710
705
// the same location is observed again, that is probably because there is
@@ -716,23 +711,23 @@ bool MemOpt::mergeLoad(LoadInst* LeadingLoad,
716
711
int64_t newLowestOffset;
717
712
uint64_t newNumElts;
718
713
719
- if (IGC_IS_FLAG_ENABLED (MergeSLMLoad ) && enableNonSeqMerge ) {
714
+ if (IGC_IS_FLAG_ENABLED (EnableMergeTransposeSLM ) && enableTransposeSLM ) {
720
715
if (!bCheckNext)
721
716
continue ;
722
717
723
- newHighestOffset = std::max (Off + ArrayElem * NextLoadSize, HighestOffset4NonSeq );
718
+ newHighestOffset = std::max (Off + ArrayElem * NextLoadSize, HighestOffset4Transpose );
724
719
newLowestOffset = std::min (Off, LowestOffset);
725
720
newNumElts = uint64_t ((newHighestOffset - newLowestOffset) /
726
721
LdScalarSize) / ArrayElem;
727
722
728
- // Update HighestOffset4NonSeq for each iteration and
723
+ // Update HighestOffset4Transpose for each iteration and
729
724
// check against the next expected in the sequence
730
725
// Example: Off = 576 when checking 'i32 5184' entry
731
726
// (Offset 0) %1184 = inttoptr i32 4608 to[144 x float] addrspace(3) *
732
727
// %1185 = getelementptr[144 x float], [144 x float] addrspace(3) * %1184, i32 0, i32 % 1179
733
728
// (Offset 576) %1186 = inttoptr i32 5184 to[144 x float] addrspace(3) *
734
- if (Off != HighestOffset4NonSeq ) {
735
- bCheckNext = false ; // abort enableNonSeqMerge checking
729
+ if (Off != HighestOffset4Transpose ) {
730
+ bCheckNext = false ; // abort enableTransposeSLM checking
736
731
continue ;
737
732
}
738
733
@@ -742,6 +737,10 @@ bool MemOpt::mergeLoad(LoadInst* LeadingLoad,
742
737
// 'NumElts' to 2 as if the i32 wasn't present.
743
738
if (uint64_t (newHighestOffset - newLowestOffset) % (LdScalarSize * ArrayElem) != 0 )
744
739
continue ;
740
+
741
+ // Limit to 3 entries for merging
742
+ if (newNumElts > 3 )
743
+ continue ;
745
744
}
746
745
else {
747
746
newHighestOffset = std::max (Off + NextLoadSize, HighestOffset);
@@ -755,14 +754,14 @@ bool MemOpt::mergeLoad(LoadInst* LeadingLoad,
755
754
// 'NumElts' to 2 as if the i32 wasn't present.
756
755
if (uint64_t (newHighestOffset - newLowestOffset) % LdScalarSize != 0 )
757
756
continue ;
758
- }
759
757
760
- // Bail out if the resulting vector load is already not profitable.
761
- if (newNumElts > profitVec[0 ])
762
- continue ;
758
+ // Bail out if the resulting vector load is already not profitable.
759
+ if (newNumElts > profitVec[0 ])
760
+ continue ;
761
+ }
763
762
764
- if (enableNonSeqMerge ) {
765
- HighestOffset4NonSeq = newHighestOffset;
763
+ if (enableTransposeSLM ) {
764
+ HighestOffset4Transpose = newHighestOffset;
766
765
}
767
766
else {
768
767
HighestOffset = newHighestOffset;
@@ -861,36 +860,40 @@ bool MemOpt::mergeLoad(LoadInst* LeadingLoad,
861
860
}
862
861
863
862
uint32_t ArrayElem = 1 ;
864
- if (IGC_IS_FLAG_ENABLED (MergeSLMLoad)) {
865
- uint32_t newInt2PtrOffset = std::get<3 >(LoadsToMerge.back ());
863
+ uint32_t newInt2PtrOffset = std::get<3 >(LoadsToMerge.back ());
866
864
867
- // lookup 144
868
- // %55 = getelementptr [144 x float], [144 x float] addrspace(3)* %54, i32 0, i32 %36
869
- if (GetElementPtrInst* GEP =
870
- dyn_cast<GetElementPtrInst>(FirstLoad->getPointerOperand ())) {
871
- Value* GEPptr = GEP->getPointerOperand ();
872
- if (GEPptr->getType ()->isPointerTy ()) {
873
- Type* GEPElemType = GEPptr->getType ()->getPointerElementType ();
874
- if (GEPElemType->isArrayTy ())
875
- ArrayElem = (uint32_t )GEPElemType->getArrayNumElements ();
876
- }
865
+ // lookup 144
866
+ // %55 = getelementptr [144 x float], [144 x float] addrspace(3)* %54, i32 0, i32 %36
867
+ if (GetElementPtrInst* GEP =
868
+ dyn_cast<GetElementPtrInst>(FirstLoad->getPointerOperand ())) {
869
+ Value* GEPptr = GEP->getPointerOperand ();
870
+ if (GEPptr->getType ()->isPointerTy ()) {
871
+ Type* GEPElemType = GEPptr->getType ()->getPointerElementType ();
872
+ if (GEPElemType->isArrayTy ())
873
+ ArrayElem = (uint32_t )GEPElemType->getArrayNumElements ();
877
874
}
878
- if (newInt2PtrOffset && ArrayElem) {
879
- Type* newArrayType = PointerType::get (
880
- ArrayType::get (LeadingLoadScalarType, ArrayElem * NumElts),
881
- LeadingLoad->getPointerAddressSpace ());
882
-
883
- Value* NewInt2Ptr = Builder.getInt32 (std::get<3 >(LoadsToMerge.back ()));
875
+ }
876
+ // If newInt2PtrOffset is non-zero, that means enableTransposeSLM is set
877
+ // Prepare new instructions
878
+ if (newInt2PtrOffset && ArrayElem) {
879
+ Type* newArrayType = PointerType::get (
880
+ ArrayType::get (LeadingLoadScalarType, ArrayElem * NumElts),
881
+ LeadingLoad->getPointerAddressSpace ());
884
882
885
- NewInt2Ptr = createBitOrPointerCast (NewInt2Ptr, newArrayType, Builder);
886
- GetElementPtrInst* LeadGEP =
887
- dyn_cast<GetElementPtrInst>(FirstLoad->getPointerOperand ());
883
+ Value* NewInt2Ptr = Builder.getInt32 (std::get<3 >(LoadsToMerge.back ()));
888
884
889
- Value* GEPArg[] = { LeadGEP->getOperand (1 ), LeadGEP->getOperand (2 ) };
885
+ NewInt2Ptr = createBitOrPointerCast (NewInt2Ptr, newArrayType, Builder);
886
+ GetElementPtrInst* LeadGEP =
887
+ dyn_cast<GetElementPtrInst>(FirstLoad->getPointerOperand ());
890
888
891
- Ptr = Builder.CreateGEP (NewInt2Ptr, GEPArg);
892
- } // if (newInt2PtrOffset && ArrayElem)
893
- } // if (IGC_IS_FLAG_ENABLED(MergeSLMLoad)
889
+ // We need to adjust the buffer index by multiplying it by 3
890
+ // From x0, x1, ... y0, y1, ... z0, z1, ...
891
+ // To x0, y0, z0, x1, y1, z1, ....
892
+ Value* NewGEPOffset =
893
+ Builder.CreateMul (LeadGEP->getOperand (2 ), Builder.getInt32 (3 ));
894
+ Value* GEPArg[] = { LeadGEP->getOperand (1 ), NewGEPOffset };
895
+ Ptr = Builder.CreateGEP (NewInt2Ptr, GEPArg);
896
+ } // if (newInt2PtrOffset && ArrayElem)
894
897
895
898
Type* NewLoadType = IGCLLVM::FixedVectorType::get (LeadingLoadScalarType, NumElts);
896
899
Type* NewPointerType =
@@ -919,7 +922,7 @@ bool MemOpt::mergeLoad(LoadInst* LeadingLoad,
919
922
allInvariantLoads = false ;
920
923
}
921
924
922
- if (IGC_IS_FLAG_ENABLED (MergeSLMLoad ) && newInt2PtrOffset)
925
+ if (IGC_IS_FLAG_ENABLED (EnableMergeTransposeSLM ) && newInt2PtrOffset)
923
926
Pos = unsigned ((std::get<1 >(I) - FirstOffset) / LdScalarSize / ArrayElem);
924
927
else
925
928
Pos = unsigned ((std::get<1 >(I) - FirstOffset) / LdScalarSize);
@@ -1041,7 +1044,7 @@ bool MemOpt::mergeStore(StoreInst* LeadingStore,
1041
1044
// be merged into the "previous" tailing store.
1042
1045
1043
1046
// Two edges of the region where stores are merged into.
1044
- int64_t LastToLeading = StSize, LastToLeading4NonSeq = 0 ;
1047
+ int64_t LastToLeading = StSize, LastToLeading4Transpose = 0 ;
1045
1048
int64_t LeadingToFirst = 0 ;
1046
1049
1047
1050
// List of instructions need dependency check.
@@ -1119,8 +1122,8 @@ bool MemOpt::mergeStore(StoreInst* LeadingStore,
1119
1122
1120
1123
unsigned NextStoreSize = unsigned (DL->getTypeStoreSize (NextStoreType));
1121
1124
1122
- bool enableNonSeqMerge = false ;
1123
- // LeadInt2PtrOffset is non-zero for enableNonSeqMerge case so we can re-create
1125
+ bool enableTransposeSLM = false ;
1126
+ // LeadInt2PtrOffset is non-zero for enableTransposeSLM case so we can re-create
1124
1127
// new inttoptr inst.
1125
1128
uint32_t LeadInt2PtrOffset = 0 ;
1126
1129
@@ -1135,19 +1138,17 @@ bool MemOpt::mergeStore(StoreInst* LeadingStore,
1135
1138
// store float% 51, float addrspace(3)*% 55, align 4
1136
1139
// store float% 52, float addrspace(3)*% 57, align 4
1137
1140
// store float% 53, float addrspace(3)*% 59, align 4
1138
- if (IGC_IS_FLAG_ENABLED (MergeSLMStore )) {
1141
+ if (IGC_IS_FLAG_ENABLED (EnableMergeTransposeSLM )) {
1139
1142
unsigned int resourceIndex = 0 ;
1140
1143
bool direct = false ;
1141
1144
BufferType bufType = IGC::DecodeAS4GFXResource (
1142
1145
LeadingStore->getPointerAddressSpace (), direct, resourceIndex);
1143
- if (SLM == bufType &&
1144
- (abs (Off) > ArrayElem) &&
1145
- LeadingStore->getPointerAddressSpace () == NextStore->getPointerAddressSpace ()) {
1146
+ if (SLM == bufType && (abs (Off) > ArrayElem)) {
1146
1147
1147
- if ((Off > 0 && Off != LastToLeading4NonSeq + ArrayElem * NextStoreSize) ||
1148
+ if ((Off > 0 && Off != LastToLeading4Transpose + ArrayElem * NextStoreSize) ||
1148
1149
(Off < 0 && (-Off) != (LeadingToFirst + ArrayElem * NextStoreSize)))
1149
1150
continue ;
1150
- else {
1151
+ else { // check if it matches the pattern for enableTransposeSLM
1151
1152
GetElementPtrInst* LeadGEP = dyn_cast<GetElementPtrInst>(LeadingStore->getOperand (1 ));
1152
1153
GetElementPtrInst* NextGEP = dyn_cast<GetElementPtrInst>(NextStore->getOperand (1 ));
1153
1154
if (!LeadGEP || !NextGEP)
@@ -1166,21 +1167,31 @@ bool MemOpt::mergeStore(StoreInst* LeadingStore,
1166
1167
dyn_cast<ConstantInt>(Int2Ptr->getOperand (0 ))) {
1167
1168
if (CI->getType ()->isIntegerTy ()) {
1168
1169
LeadInt2PtrOffset = (uint32_t )CI->getZExtValue ();
1169
- enableNonSeqMerge = true ;
1170
+ enableTransposeSLM = true ;
1170
1171
}
1171
1172
}
1172
- }
1173
- }
1173
+ } // if (IntToPtrInst* Int2Ptr
1174
+
1175
+ if (!enableTransposeSLM)
1176
+ continue ;
1177
+
1178
+ NumElts += getNumElements (NextStoreType);
1179
+
1180
+ // Limit to 3 entries for merging
1181
+ if (NumElts > 3 )
1182
+ break ;
1183
+ } // else { // check if it matches the pattern for enableTransposeSLM
1174
1184
} // if (SLM == bufType && ...
1175
1185
} else if ((Off > 0 && Off != LastToLeading) ||
1176
1186
(Off < 0 && (-Off) != (LeadingToFirst + NextStoreSize)))
1177
1187
// Check it's consecutive to the current stores to be merged.
1178
1188
continue ;
1179
-
1180
- NumElts += getNumElements (NextStoreType);
1181
- // Bail out if the resulting vector store is already not profitable.
1182
- if (NumElts > profitVec[0 ])
1183
- break ;
1189
+ else {
1190
+ NumElts += getNumElements (NextStoreType);
1191
+ // Bail out if the resulting vector store is already not profitable.
1192
+ if (NumElts > profitVec[0 ])
1193
+ break ;
1194
+ }
1184
1195
1185
1196
// This store is to be merged. Remove it from check list.
1186
1197
CheckList.pop_back ();
@@ -1197,7 +1208,7 @@ bool MemOpt::mergeStore(StoreInst* LeadingStore,
1197
1208
1198
1209
if (Off > 0 ) {
1199
1210
LastToLeading = Off + NextStoreSize;
1200
- LastToLeading4NonSeq = Off;
1211
+ LastToLeading4Transpose = Off;
1201
1212
}
1202
1213
else
1203
1214
LeadingToFirst = (-Off);
@@ -1279,7 +1290,7 @@ bool MemOpt::mergeStore(StoreInst* LeadingStore,
1279
1290
}
1280
1291
}
1281
1292
1282
- // If newInt2PtrOffset is non-zero, that means enableNonSeqMerge is set
1293
+ // If newInt2PtrOffset is non-zero, that means enableTransposeSLM is set
1283
1294
// Prepare new instructions
1284
1295
if (newInt2PtrOffset && ArrayElem) {
1285
1296
Type* newArrayType = PointerType::get (
@@ -1291,10 +1302,13 @@ bool MemOpt::mergeStore(StoreInst* LeadingStore,
1291
1302
GetElementPtrInst* LeadGEP =
1292
1303
dyn_cast<GetElementPtrInst>(FirstStore->getOperand (1 ));
1293
1304
1294
- Value* GEPArg[] = { LeadGEP->getOperand (1 ), LeadGEP->getOperand (2 ) };
1295
-
1296
- BitCastPtr = Builder.CreateGEP (
1297
- NewInt2Ptr, GEPArg);
1305
+ // We need to adjust the buffer index by multiplying it by 3
1306
+ // From x0, x1, ... y0, y1, ... z0, z1, ...
1307
+ // To x0, y0, z0, x1, y1, z1, ....
1308
+ Value* NewGEPOffset =
1309
+ Builder.CreateMul (LeadGEP->getOperand (2 ), Builder.getInt32 (3 ));
1310
+ Value* GEPArg[] = { LeadGEP->getOperand (1 ), NewGEPOffset };
1311
+ BitCastPtr = Builder.CreateGEP (NewInt2Ptr, GEPArg);
1298
1312
}
1299
1313
1300
1314
for (auto & I : StoresToMerge) {
0 commit comments