Skip to content

Commit 1061511

Browse files
committed
[X86PreAMXConfig] Use IRBuilder to insert instructions (NFC)
Use an IRBuilder to insert instructions in preWriteTileCfg(). While here, also remove some unnecessary bool return values. There are some test changes because the IRBuilder folds "trunc i16 8 to i8" to "i8 8", and that has knock-on effects on instruction naming. I came across this code while converting tests to opaque pointers, where I noticed that this pass introduces unnecessary "bitcast ptr to ptr" instructions.
1 parent 65f44c9 commit 1061511

File tree

2 files changed

+86
-99
lines changed

2 files changed

+86
-99
lines changed

llvm/lib/Target/X86/X86PreAMXConfig.cpp

Lines changed: 18 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -98,10 +98,10 @@ class X86PreAMXConfig {
9898
public:
9999
X86PreAMXConfig(Function &Func) : F(Func) {}
100100
bool preTileConfig();
101-
bool addTileConfig(Instruction *ModelStart, SmallVector<Value *, 8> &Shapes);
101+
void addTileConfig(Instruction *ModelStart, SmallVector<Value *, 8> &Shapes);
102102
bool findConfigShapes(PosAndShapesMap &PosAndShapes);
103103
bool getKeyAMXShapes(IntrinsicInst *KeyAMX, SmallVector<Value *, 8> &Shapes);
104-
bool preWriteTileCfg(Value *I8Ptr, Instruction *Pos,
104+
void preWriteTileCfg(Value *I8Ptr, IRBuilderBase &Builder,
105105
SmallVector<Value *, 8> &Shapes);
106106
BasicBlock::iterator
107107
getShapesAndConfigPosEnd(BasicBlock::iterator Iter,
@@ -150,41 +150,37 @@ class X86PreAMXConfig {
150150
// %td = tail call x86_amx @llvm.x86.tdpbssd.internal(m, n, k, t1, t2, t3)
151151
// call void @llvm.x86.tilestored64.internal(... td) area
152152
// --------------------------------------------------------------------------
153-
bool X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, Instruction *Pos,
153+
void X86PreAMXConfig::preWriteTileCfg(Value *I8Ptr, IRBuilderBase &Builder,
154154
SmallVector<Value *, 8> &Shapes) {
155-
bool Write = false;
156-
LLVMContext &Ctx = Pos->getParent()->getContext();
155+
LLVMContext &Ctx = Builder.getContext();
157156
Type *I8Ty = Type::getInt8Ty(Ctx);
158157
Type *I16Ty = Type::getInt16Ty(Ctx);
159158

160159
// TODO: Currently we defaultly set Palette = 1, it may be assigned to
161160
// other value in the future.
162161
Value *PaletteOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 0);
163162
Value *PaletteValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
164-
Value *PalettePos =
165-
GetElementPtrInst::Create(I8Ty, I8Ptr, PaletteOffset, "", Pos);
166-
new StoreInst(PaletteValue, PalettePos, Pos);
163+
Value *PalettePos = Builder.CreateGEP(I8Ty, I8Ptr, PaletteOffset);
164+
Builder.CreateStore(PaletteValue, PalettePos);
167165

168166
for (int I = 0, E = Shapes.size() / 2; I < E; I++) {
169167
Value *RowOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 48 + I);
170168
Value *ColOffset = ConstantInt::get(Type::getInt64Ty(Ctx), 16 + I * 2);
171169
const std::string ShapeName = "amx.tmm." + itostr(I);
172-
Value *RowPos = GetElementPtrInst::Create(I8Ty, I8Ptr, RowOffset,
173-
ShapeName + ".shape.row", Pos);
174-
Value *ColPos = GetElementPtrInst::Create(I8Ty, I8Ptr, ColOffset, "", Pos);
175-
ColPos = new BitCastInst(ColPos, PointerType::get(I16Ty, 0),
176-
ShapeName + ".shape.col", Pos);
170+
Value *RowPos = Builder.CreateGEP(I8Ty, I8Ptr, RowOffset,
171+
ShapeName + ".shape.row");
172+
Value *ColPos = Builder.CreateGEP(I8Ty, I8Ptr, ColOffset);
173+
ColPos = Builder.CreateBitCast(ColPos, PointerType::get(I16Ty, 0),
174+
ShapeName + ".shape.col");
177175
Value *Row = Shapes[I * 2];
178176
Value *Col = Shapes[I * 2 + 1];
179-
Row = new TruncInst(Row, I8Ty, "", Pos);
180-
new StoreInst(Row, RowPos, Pos);
181-
new StoreInst(Col, ColPos, Pos);
182-
Write = true;
177+
Row = Builder.CreateTrunc(Row, I8Ty);
178+
Builder.CreateStore(Row, RowPos);
179+
Builder.CreateStore(Col, ColPos);
183180
}
184-
return Write;
185181
}
186182

187-
bool X86PreAMXConfig::addTileConfig(Instruction *ModelStart,
183+
void X86PreAMXConfig::addTileConfig(Instruction *ModelStart,
188184
SmallVector<Value *, 8> &Shapes) {
189185
Module *M = F.getParent();
190186
IRBuilder<> Builder(ModelStart);
@@ -199,17 +195,11 @@ bool X86PreAMXConfig::addTileConfig(Instruction *ModelStart,
199195
Addr->setAlignment(Alignment);
200196
Value *I8Ptr = Builder.CreateBitCast(Addr, Builder.getInt8PtrTy());
201197

202-
std::array<Value *, 1> Args = {I8Ptr};
203-
Instruction *Cfg =
204-
Builder.CreateIntrinsic(Intrinsic::x86_ldtilecfg_internal, None, Args);
205-
206-
Value *Val0 = Constant::getNullValue(V512Ty);
207-
Instruction *Init0 = new StoreInst(Val0, Addr, false, Alignment, Cfg);
208-
assert(Init0 && "Not Zero initilizate the cfg mem!");
198+
Builder.CreateAlignedStore(Constant::getNullValue(V512Ty), Addr, Alignment);
209199

210-
preWriteTileCfg(I8Ptr, Cfg, Shapes);
200+
preWriteTileCfg(I8Ptr, Builder, Shapes);
211201

212-
return Init0;
202+
Builder.CreateIntrinsic(Intrinsic::x86_ldtilecfg_internal, None, {I8Ptr});
213203
}
214204

215205
// Todo: We may need to handle "more than one store" case in the future.

llvm/test/CodeGen/X86/AMX/amx-configO2toO0-precfg.ll

Lines changed: 68 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -46,113 +46,110 @@ define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) l
4646
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_ROW1:%.*]] = getelementptr i8, i8* [[TMP12]], i64 48
4747
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, i8* [[TMP12]], i64 16
4848
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_COL2:%.*]] = bitcast i8* [[TMP14]] to i16*
49-
; CHECK-NEXT: [[TMP15:%.*]] = trunc i16 8 to i8
50-
; CHECK-NEXT: store i8 [[TMP15]], i8* [[AMX_TMM_0_SHAPE_ROW1]], align 1
49+
; CHECK-NEXT: store i8 8, i8* [[AMX_TMM_0_SHAPE_ROW1]], align 1
5150
; CHECK-NEXT: store i16 [[COL:%.*]], i16* [[AMX_TMM_0_SHAPE_COL2]], align 2
5251
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* [[TMP12]])
5352
; CHECK-NEXT: [[I9:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 [[COL]], i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
5453
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 [[COL]], i8* [[I3]], i64 64, x86_amx [[I9]])
55-
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i32>* [[TMP5]] to i8*
54+
; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i32>* [[TMP5]] to i8*
5655
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* [[TMP5]], align 4
57-
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, i8* [[TMP16]], i64 0
58-
; CHECK-NEXT: store i8 1, i8* [[TMP17]], align 1
59-
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_ROW3:%.*]] = getelementptr i8, i8* [[TMP16]], i64 48
60-
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP16]], i64 16
61-
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_COL4:%.*]] = bitcast i8* [[TMP18]] to i16*
62-
; CHECK-NEXT: [[TMP19:%.*]] = trunc i16 [[ROW]] to i8
63-
; CHECK-NEXT: store i8 [[TMP19]], i8* [[AMX_TMM_0_SHAPE_ROW3]], align 1
56+
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, i8* [[TMP15]], i64 0
57+
; CHECK-NEXT: store i8 1, i8* [[TMP16]], align 1
58+
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_ROW3:%.*]] = getelementptr i8, i8* [[TMP15]], i64 48
59+
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, i8* [[TMP15]], i64 16
60+
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_COL4:%.*]] = bitcast i8* [[TMP17]] to i16*
61+
; CHECK-NEXT: [[TMP18:%.*]] = trunc i16 [[ROW]] to i8
62+
; CHECK-NEXT: store i8 [[TMP18]], i8* [[AMX_TMM_0_SHAPE_ROW3]], align 1
6463
; CHECK-NEXT: store i16 [[COL]], i16* [[AMX_TMM_0_SHAPE_COL4]], align 2
65-
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* [[TMP16]])
64+
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* [[TMP15]])
6665
; CHECK-NEXT: [[I10:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL]], i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
6766
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL]], i8* [[I1]], i64 64, x86_amx [[I10]])
6867
; CHECK-NEXT: br label [[IF_END:%.*]]
6968
; CHECK: if.else:
70-
; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i32>* [[TMP4]] to i8*
69+
; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i32>* [[TMP4]] to i8*
7170
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* [[TMP4]], align 4
72-
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, i8* [[TMP20]], i64 0
73-
; CHECK-NEXT: store i8 1, i8* [[TMP21]], align 1
74-
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_ROW5:%.*]] = getelementptr i8, i8* [[TMP20]], i64 48
75-
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, i8* [[TMP20]], i64 16
76-
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_COL6:%.*]] = bitcast i8* [[TMP22]] to i16*
77-
; CHECK-NEXT: [[TMP23:%.*]] = trunc i16 [[ROW]] to i8
78-
; CHECK-NEXT: store i8 [[TMP23]], i8* [[AMX_TMM_0_SHAPE_ROW5]], align 1
71+
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[TMP19]], i64 0
72+
; CHECK-NEXT: store i8 1, i8* [[TMP20]], align 1
73+
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_ROW5:%.*]] = getelementptr i8, i8* [[TMP19]], i64 48
74+
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, i8* [[TMP19]], i64 16
75+
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_COL6:%.*]] = bitcast i8* [[TMP21]] to i16*
76+
; CHECK-NEXT: [[TMP22:%.*]] = trunc i16 [[ROW]] to i8
77+
; CHECK-NEXT: store i8 [[TMP22]], i8* [[AMX_TMM_0_SHAPE_ROW5]], align 1
7978
; CHECK-NEXT: store i16 8, i16* [[AMX_TMM_0_SHAPE_COL6]], align 2
80-
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* [[TMP20]])
79+
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* [[TMP19]])
8180
; CHECK-NEXT: [[I11:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
8281
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 8, i8* [[I5]], i64 64, x86_amx [[I11]])
83-
; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i32>* [[TMP3]] to i8*
82+
; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i32>* [[TMP3]] to i8*
8483
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* [[TMP3]], align 4
85-
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, i8* [[TMP24]], i64 0
86-
; CHECK-NEXT: store i8 1, i8* [[TMP25]], align 1
87-
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_ROW7:%.*]] = getelementptr i8, i8* [[TMP24]], i64 48
88-
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, i8* [[TMP24]], i64 16
89-
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_COL8:%.*]] = bitcast i8* [[TMP26]] to i16*
90-
; CHECK-NEXT: [[TMP27:%.*]] = trunc i16 8 to i8
91-
; CHECK-NEXT: store i8 [[TMP27]], i8* [[AMX_TMM_0_SHAPE_ROW7]], align 1
84+
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, i8* [[TMP23]], i64 0
85+
; CHECK-NEXT: store i8 1, i8* [[TMP24]], align 1
86+
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_ROW7:%.*]] = getelementptr i8, i8* [[TMP23]], i64 48
87+
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, i8* [[TMP23]], i64 16
88+
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_COL8:%.*]] = bitcast i8* [[TMP25]] to i16*
89+
; CHECK-NEXT: store i8 8, i8* [[AMX_TMM_0_SHAPE_ROW7]], align 1
9290
; CHECK-NEXT: store i16 [[COL]], i16* [[AMX_TMM_0_SHAPE_COL8]], align 2
93-
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* [[TMP24]])
91+
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* [[TMP23]])
9492
; CHECK-NEXT: [[I12:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 [[COL]], i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
9593
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 [[COL]], i8* [[I3]], i64 64, x86_amx [[I12]])
96-
; CHECK-NEXT: [[TMP28:%.*]] = bitcast <16 x i32>* [[TMP2]] to i8*
94+
; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i32>* [[TMP2]] to i8*
9795
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* [[TMP2]], align 4
98-
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i8, i8* [[TMP28]], i64 0
99-
; CHECK-NEXT: store i8 1, i8* [[TMP29]], align 1
100-
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_ROW9:%.*]] = getelementptr i8, i8* [[TMP28]], i64 48
101-
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i8, i8* [[TMP28]], i64 16
102-
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_COL10:%.*]] = bitcast i8* [[TMP30]] to i16*
103-
; CHECK-NEXT: [[TMP31:%.*]] = trunc i16 [[ROW]] to i8
104-
; CHECK-NEXT: store i8 [[TMP31]], i8* [[AMX_TMM_0_SHAPE_ROW9]], align 1
96+
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i8, i8* [[TMP26]], i64 0
97+
; CHECK-NEXT: store i8 1, i8* [[TMP27]], align 1
98+
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_ROW9:%.*]] = getelementptr i8, i8* [[TMP26]], i64 48
99+
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, i8* [[TMP26]], i64 16
100+
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_COL10:%.*]] = bitcast i8* [[TMP28]] to i16*
101+
; CHECK-NEXT: [[TMP29:%.*]] = trunc i16 [[ROW]] to i8
102+
; CHECK-NEXT: store i8 [[TMP29]], i8* [[AMX_TMM_0_SHAPE_ROW9]], align 1
105103
; CHECK-NEXT: store i16 [[COL]], i16* [[AMX_TMM_0_SHAPE_COL10]], align 2
106-
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* [[TMP28]])
104+
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* [[TMP26]])
107105
; CHECK-NEXT: [[I13:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL]], i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
108106
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL]], i8* [[I1]], i64 64, x86_amx [[I13]])
109107
; CHECK-NEXT: br label [[IF_END]]
110108
; CHECK: if.end:
111-
; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32>* [[TMP1]] to i8*
109+
; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32>* [[TMP1]] to i8*
112110
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* [[TMP1]], align 4
113-
; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i8, i8* [[TMP32]], i64 0
114-
; CHECK-NEXT: store i8 1, i8* [[TMP33]], align 1
115-
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_ROW11:%.*]] = getelementptr i8, i8* [[TMP32]], i64 48
116-
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i8, i8* [[TMP32]], i64 16
117-
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_COL12:%.*]] = bitcast i8* [[TMP34]] to i16*
118-
; CHECK-NEXT: [[TMP35:%.*]] = trunc i16 [[ROW]] to i8
119-
; CHECK-NEXT: store i8 [[TMP35]], i8* [[AMX_TMM_0_SHAPE_ROW11]], align 1
111+
; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i8, i8* [[TMP30]], i64 0
112+
; CHECK-NEXT: store i8 1, i8* [[TMP31]], align 1
113+
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_ROW11:%.*]] = getelementptr i8, i8* [[TMP30]], i64 48
114+
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, i8* [[TMP30]], i64 16
115+
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_COL12:%.*]] = bitcast i8* [[TMP32]] to i16*
116+
; CHECK-NEXT: [[TMP33:%.*]] = trunc i16 [[ROW]] to i8
117+
; CHECK-NEXT: store i8 [[TMP33]], i8* [[AMX_TMM_0_SHAPE_ROW11]], align 1
120118
; CHECK-NEXT: store i16 [[COL]], i16* [[AMX_TMM_0_SHAPE_COL12]], align 2
121-
; CHECK-NEXT: [[AMX_TMM_1_SHAPE_ROW:%.*]] = getelementptr i8, i8* [[TMP32]], i64 49
122-
; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i8, i8* [[TMP32]], i64 18
123-
; CHECK-NEXT: [[AMX_TMM_1_SHAPE_COL:%.*]] = bitcast i8* [[TMP36]] to i16*
124-
; CHECK-NEXT: [[TMP37:%.*]] = trunc i16 [[ROW]] to i8
125-
; CHECK-NEXT: store i8 [[TMP37]], i8* [[AMX_TMM_1_SHAPE_ROW]], align 1
119+
; CHECK-NEXT: [[AMX_TMM_1_SHAPE_ROW:%.*]] = getelementptr i8, i8* [[TMP30]], i64 49
120+
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i8, i8* [[TMP30]], i64 18
121+
; CHECK-NEXT: [[AMX_TMM_1_SHAPE_COL:%.*]] = bitcast i8* [[TMP34]] to i16*
122+
; CHECK-NEXT: [[TMP35:%.*]] = trunc i16 [[ROW]] to i8
123+
; CHECK-NEXT: store i8 [[TMP35]], i8* [[AMX_TMM_1_SHAPE_ROW]], align 1
126124
; CHECK-NEXT: store i16 8, i16* [[AMX_TMM_1_SHAPE_COL]], align 2
127-
; CHECK-NEXT: [[AMX_TMM_2_SHAPE_ROW:%.*]] = getelementptr i8, i8* [[TMP32]], i64 50
128-
; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i8, i8* [[TMP32]], i64 20
129-
; CHECK-NEXT: [[AMX_TMM_2_SHAPE_COL:%.*]] = bitcast i8* [[TMP38]] to i16*
130-
; CHECK-NEXT: [[TMP39:%.*]] = trunc i16 8 to i8
131-
; CHECK-NEXT: store i8 [[TMP39]], i8* [[AMX_TMM_2_SHAPE_ROW]], align 1
125+
; CHECK-NEXT: [[AMX_TMM_2_SHAPE_ROW:%.*]] = getelementptr i8, i8* [[TMP30]], i64 50
126+
; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i8, i8* [[TMP30]], i64 20
127+
; CHECK-NEXT: [[AMX_TMM_2_SHAPE_COL:%.*]] = bitcast i8* [[TMP36]] to i16*
128+
; CHECK-NEXT: store i8 8, i8* [[AMX_TMM_2_SHAPE_ROW]], align 1
132129
; CHECK-NEXT: store i16 [[COL]], i16* [[AMX_TMM_2_SHAPE_COL]], align 2
133-
; CHECK-NEXT: [[AMX_TMM_3_SHAPE_ROW:%.*]] = getelementptr i8, i8* [[TMP32]], i64 51
134-
; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i8, i8* [[TMP32]], i64 22
135-
; CHECK-NEXT: [[AMX_TMM_3_SHAPE_COL:%.*]] = bitcast i8* [[TMP40]] to i16*
136-
; CHECK-NEXT: [[TMP41:%.*]] = trunc i16 [[ROW]] to i8
137-
; CHECK-NEXT: store i8 [[TMP41]], i8* [[AMX_TMM_3_SHAPE_ROW]], align 1
130+
; CHECK-NEXT: [[AMX_TMM_3_SHAPE_ROW:%.*]] = getelementptr i8, i8* [[TMP30]], i64 51
131+
; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i8, i8* [[TMP30]], i64 22
132+
; CHECK-NEXT: [[AMX_TMM_3_SHAPE_COL:%.*]] = bitcast i8* [[TMP37]] to i16*
133+
; CHECK-NEXT: [[TMP38:%.*]] = trunc i16 [[ROW]] to i8
134+
; CHECK-NEXT: store i8 [[TMP38]], i8* [[AMX_TMM_3_SHAPE_ROW]], align 1
138135
; CHECK-NEXT: store i16 [[COL]], i16* [[AMX_TMM_3_SHAPE_COL]], align 2
139-
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* [[TMP32]])
136+
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* [[TMP30]])
140137
; CHECK-NEXT: [[I14:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 8, i8* [[I5]], i64 64)
141138
; CHECK-NEXT: [[I15:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 [[COL]], i8* [[I3]], i64 64)
142139
; CHECK-NEXT: [[I16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL]], i8* [[I1]], i64 64)
143140
; CHECK-NEXT: [[I17:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL]], i16 8, x86_amx [[I16]], x86_amx [[I14]], x86_amx [[I15]])
144141
; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL]], i8* [[I7]], i64 64, x86_amx [[I17]])
145-
; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i32>* [[TMP0]] to i8*
142+
; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32>* [[TMP0]] to i8*
146143
; CHECK-NEXT: store <16 x i32> zeroinitializer, <16 x i32>* [[TMP0]], align 4
147-
; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i8, i8* [[TMP42]], i64 0
148-
; CHECK-NEXT: store i8 1, i8* [[TMP43]], align 1
149-
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_ROW13:%.*]] = getelementptr i8, i8* [[TMP42]], i64 48
150-
; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i8, i8* [[TMP42]], i64 16
151-
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_COL14:%.*]] = bitcast i8* [[TMP44]] to i16*
152-
; CHECK-NEXT: [[TMP45:%.*]] = trunc i16 [[ROW]] to i8
153-
; CHECK-NEXT: store i8 [[TMP45]], i8* [[AMX_TMM_0_SHAPE_ROW13]], align 1
144+
; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i8, i8* [[TMP39]], i64 0
145+
; CHECK-NEXT: store i8 1, i8* [[TMP40]], align 1
146+
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_ROW13:%.*]] = getelementptr i8, i8* [[TMP39]], i64 48
147+
; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i8, i8* [[TMP39]], i64 16
148+
; CHECK-NEXT: [[AMX_TMM_0_SHAPE_COL14:%.*]] = bitcast i8* [[TMP41]] to i16*
149+
; CHECK-NEXT: [[TMP42:%.*]] = trunc i16 [[ROW]] to i8
150+
; CHECK-NEXT: store i8 [[TMP42]], i8* [[AMX_TMM_0_SHAPE_ROW13]], align 1
154151
; CHECK-NEXT: store i16 [[COL]], i16* [[AMX_TMM_0_SHAPE_COL14]], align 2
155-
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* [[TMP42]])
152+
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(i8* [[TMP39]])
156153
; CHECK-NEXT: [[I18:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL]], i8* [[I7]], i64 64)
157154
; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL]], i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx [[I18]])
158155
; CHECK-NEXT: ret void

0 commit comments

Comments
 (0)