Skip to content

Commit 964565f

Browse files
authored
[SystemZ] i128 cost model (#78528)
Update SystemZTTI to reflect the recent change of handling i128 as a legal type in vector registers.
1 parent 15c1c85 commit 964565f

File tree

8 files changed

+337
-62
lines changed

8 files changed

+337
-62
lines changed

llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp

Lines changed: 74 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,8 @@ InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
7575
// here, so that constant hoisting will ignore this constant.
7676
if (BitSize == 0)
7777
return TTI::TCC_Free;
78-
// No cost model for operations on integers larger than 64 bit implemented yet.
79-
if (BitSize > 64)
78+
// No cost model for operations on integers larger than 128 bit implemented yet.
79+
if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
8080
return TTI::TCC_Free;
8181

8282
if (Imm == 0)
@@ -96,7 +96,8 @@ InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
9696
return 2 * TTI::TCC_Basic;
9797
}
9898

99-
return 4 * TTI::TCC_Basic;
99+
// i128 immediates are loaded from the constant pool.
100+
return 2 * TTI::TCC_Basic;
100101
}
101102

102103
InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
@@ -479,21 +480,27 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
479480
return LIBCALL_COST;
480481

481482
// Give discount for some combined logical operations if supported.
482-
if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
483+
if (Args.size() == 2) {
483484
if (Opcode == Instruction::Xor) {
484485
for (const Value *A : Args) {
485486
if (const Instruction *I = dyn_cast<Instruction>(A))
486487
if (I->hasOneUse() &&
487-
(I->getOpcode() == Instruction::And ||
488-
I->getOpcode() == Instruction::Or ||
488+
(I->getOpcode() == Instruction::Or ||
489+
I->getOpcode() == Instruction::And ||
489490
I->getOpcode() == Instruction::Xor))
490-
return 0;
491+
if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
492+
(isInt128InVR(Ty) &&
493+
(I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1())))
494+
return 0;
491495
}
492496
}
493-
else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
497+
else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
494498
for (const Value *A : Args) {
495499
if (const Instruction *I = dyn_cast<Instruction>(A))
496-
if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
500+
if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
501+
((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
502+
(isInt128InVR(Ty) &&
503+
(Opcode == Instruction::And || ST->hasVectorEnhancements1()))))
497504
return 0;
498505
}
499506
}
@@ -774,29 +781,63 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
774781
assert (!Dst->isVectorTy());
775782

776783
if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
784+
if (Src->isIntegerTy(128))
785+
return LIBCALL_COST;
777786
if (SrcScalarBits >= 32 ||
778787
(I != nullptr && isa<LoadInst>(I->getOperand(0))))
779788
return 1;
780789
return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
781790
}
782791

783-
if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
784-
Src->isIntegerTy(1)) {
785-
if (ST->hasLoadStoreOnCond2())
786-
return 2; // li 0; loc 1
787-
788-
// This should be extension of a compare i1 result, which is done with
789-
// ipm and a varying sequence of instructions.
790-
unsigned Cost = 0;
791-
if (Opcode == Instruction::SExt)
792-
Cost = (DstScalarBits < 64 ? 3 : 4);
793-
if (Opcode == Instruction::ZExt)
794-
Cost = 3;
795-
Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
796-
if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
797-
// If operands of an fp-type was compared, this costs +1.
798-
Cost++;
799-
return Cost;
792+
if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
793+
Dst->isIntegerTy(128))
794+
return LIBCALL_COST;
795+
796+
if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
797+
if (Src->isIntegerTy(1)) {
798+
if (DstScalarBits == 128)
799+
return 5 /*branch seq.*/;
800+
801+
if (ST->hasLoadStoreOnCond2())
802+
return 2; // li 0; loc 1
803+
804+
// This should be extension of a compare i1 result, which is done with
805+
// ipm and a varying sequence of instructions.
806+
unsigned Cost = 0;
807+
if (Opcode == Instruction::SExt)
808+
Cost = (DstScalarBits < 64 ? 3 : 4);
809+
if (Opcode == Instruction::ZExt)
810+
Cost = 3;
811+
Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
812+
if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
813+
// If operands of an fp-type were compared, this costs +1.
814+
Cost++;
815+
return Cost;
816+
}
817+
else if (isInt128InVR(Dst)) {
818+
// Extensions from GPR to i128 (in VR) typically cost two instructions,
819+
// but a zero-extending load would be just one extra instruction.
820+
if (Opcode == Instruction::ZExt && I != nullptr)
821+
if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
822+
if (Ld->hasOneUse())
823+
return 1;
824+
return 2;
825+
}
826+
}
827+
828+
if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
829+
if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
830+
if (Ld->hasOneUse())
831+
return 0; // Will be converted to GPR load.
832+
bool OnlyTruncatingStores = true;
833+
for (const User *U : I->users())
834+
if (!isa<StoreInst>(U)) {
835+
OnlyTruncatingStores = false;
836+
break;
837+
}
838+
if (OnlyTruncatingStores)
839+
return 0;
840+
return 2; // Vector element extraction.
800841
}
801842
}
802843
else if (ST->hasVector()) {
@@ -930,7 +971,7 @@ InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
930971
// A loaded value compared with 0 with multiple users becomes Load and
931972
// Test. The load is then not foldable, so return 0 cost for the ICmp.
932973
unsigned ScalarBits = ValTy->getScalarSizeInBits();
933-
if (I != nullptr && ScalarBits >= 32)
974+
if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
934975
if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
935976
if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
936977
if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
@@ -943,8 +984,8 @@ InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
943984
return Cost;
944985
}
945986
case Instruction::Select:
946-
if (ValTy->isFloatingPointTy())
947-
return 4; // No load on condition for FP - costs a conditional jump.
987+
if (ValTy->isFloatingPointTy() || isInt128InVR(ValTy))
988+
return 4; // No LOC for FP / i128 - costs a conditional jump.
948989
return 1; // Load On Condition / Select Register.
949990
}
950991
}
@@ -1157,6 +1198,10 @@ InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
11571198
return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
11581199
CostKind);
11591200

1201+
// FP128 is a legal type but kept in a register pair on older CPUs.
1202+
if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
1203+
return 2;
1204+
11601205
unsigned NumOps =
11611206
(Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
11621207

@@ -1177,10 +1222,6 @@ InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
11771222
}
11781223
}
11791224

1180-
if (Src->getScalarSizeInBits() == 128)
1181-
// 128 bit scalars are held in a pair of two 64 bit registers.
1182-
NumOps *= 2;
1183-
11841225
return NumOps;
11851226
}
11861227

llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
2828

2929
unsigned const LIBCALL_COST = 30;
3030

31+
// Returns true if Ty is the 128-bit integer type and the subtarget (ST) has
// the vector facility, i.e. the case where i128 is handled in a vector register.
bool isInt128InVR(Type *Ty) { return Ty->isIntegerTy(128) && ST->hasVector(); }
32+
3133
public:
3234
explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F)
3335
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
2+
;
3+
4+
define i128 @fun1(i128 %val1, i128 %val2) {
5+
; CHECK-LABEL: 'fun1'
6+
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp eq i128 %val1, %val2
7+
; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %v128 = sext i1 %cmp to i128
8+
%cmp = icmp eq i128 %val1, %val2
9+
%v128 = sext i1 %cmp to i128
10+
ret i128 %v128
11+
}
12+
13+
define i128 @fun2(i128 %val1, i128 %val2) {
14+
; CHECK-LABEL: 'fun2'
15+
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp eq i128 %val1, %val2
16+
; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %v128 = zext i1 %cmp to i128
17+
%cmp = icmp eq i128 %val1, %val2
18+
%v128 = zext i1 %cmp to i128
19+
ret i128 %v128
20+
}
21+
22+
define i128 @fun3(i128 %val1, i128 %val2,
23+
i128 %val3, i128 %val4) {
24+
; CHECK-LABEL: 'fun3'
25+
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp eq i128 %val1, %val2
26+
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %add = add i128 %val3, %val4
27+
; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %sel = select i1 %cmp, i128 %val3, i128 %add
28+
%cmp = icmp eq i128 %val1, %val2
29+
%add = add i128 %val3, %val4
30+
%sel = select i1 %cmp, i128 %val3, i128 %add
31+
ret i128 %sel
32+
}
33+
34+
define i128 @fun4(ptr %src) {
35+
; CHECK-LABEL: 'fun4'
36+
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res = sext i64 %v to i128
37+
%v = load i64, ptr %src, align 8
38+
%res = sext i64 %v to i128
39+
ret i128 %res
40+
}
41+
42+
define i128 @fun5(i64 %lhs, i64 %rhs) {
43+
; CHECK-LABEL: 'fun5'
44+
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res = sext i64 %v to i128
45+
%v = add i64 %lhs, %rhs
46+
%res = sext i64 %v to i128
47+
ret i128 %res
48+
}
49+
50+
define i128 @fun6(ptr %src) {
51+
; CHECK-LABEL: 'fun6'
52+
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res = zext i64 %v to i128
53+
%v = load i64, ptr %src, align 8
54+
%res = zext i64 %v to i128
55+
ret i128 %res
56+
}
57+
58+
define i128 @fun7(i64 %lhs, i64 %rhs) {
59+
; CHECK-LABEL: 'fun7'
60+
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res = zext i64 %v to i128
61+
%v = add i64 %lhs, %rhs
62+
%res = zext i64 %v to i128
63+
ret i128 %res
64+
}
65+
66+
; Truncating store is free.
67+
define void @fun8(i128 %lhs, i128 %rhs, ptr %dst) {
68+
; CHECK-LABEL: 'fun8'
69+
; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %t = trunc i128 %v to i64
70+
%v = add i128 %lhs, %rhs
71+
%t = trunc i128 %v to i64
72+
store i64 %t, ptr %dst, align 8
73+
ret void
74+
}
75+
76+
; If there is a non-store user, an extraction is needed.
77+
define i64 @fun9(i128 %lhs, i128 %rhs, ptr %dst) {
78+
; CHECK-LABEL: 'fun9'
79+
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %t = trunc i128 %v to i64
80+
%v = add i128 %lhs, %rhs
81+
%t = trunc i128 %v to i64
82+
store i64 %t, ptr %dst, align 8
83+
ret i64 %t
84+
}
85+
86+
; Truncation of load is free.
87+
define i64 @fun10(ptr %src) {
88+
; CHECK-LABEL: 'fun10'
89+
; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %t = trunc i128 %v to i64
90+
%v = load i128, ptr %src, align 8
91+
%t = trunc i128 %v to i64
92+
ret i64 %t
93+
}
94+
95+
; If the load has another user, the truncation becomes an extract.
96+
define i64 @fun11(ptr %src, i128 %val2, ptr %dst) {
97+
; CHECK-LABEL: 'fun11'
98+
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %t = trunc i128 %v to i64
99+
%v = load i128, ptr %src, align 8
100+
%t = trunc i128 %v to i64
101+
%a = add i128 %v, %val2
102+
store i128 %a, ptr %dst
103+
ret i64 %t
104+
}
105+
106+
; Truncation with a GPR use typically requires an extraction.
107+
define i64 @fun12(i128 %lhs, i128 %rhs) {
108+
; CHECK-LABEL: 'fun12'
109+
; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %t = trunc i128 %v to i64
110+
%v = add i128 %lhs, %rhs
111+
%t = trunc i128 %v to i64
112+
ret i64 %t
113+
}
114+
115+
; Fp<->Int conversions require libcalls.
116+
define void @fun13() {
117+
; CHECK-LABEL: 'fun13'
118+
; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v0 = fptosi fp128 undef to i128
119+
; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v1 = fptosi double undef to i128
120+
; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v2 = fptosi float undef to i128
121+
; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v3 = fptoui fp128 undef to i128
122+
; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v4 = fptoui double undef to i128
123+
; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v5 = fptoui float undef to i128
124+
; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v6 = sitofp i128 undef to fp128
125+
; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v7 = sitofp i128 undef to double
126+
; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v8 = sitofp i128 undef to float
127+
; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v9 = uitofp i128 undef to fp128
128+
; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v10 = uitofp i128 undef to double
129+
; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v11 = uitofp i128 undef to float
130+
%v0 = fptosi fp128 undef to i128
131+
%v1 = fptosi double undef to i128
132+
%v2 = fptosi float undef to i128
133+
%v3 = fptoui fp128 undef to i128
134+
%v4 = fptoui double undef to i128
135+
%v5 = fptoui float undef to i128
136+
%v6 = sitofp i128 undef to fp128
137+
%v7 = sitofp i128 undef to double
138+
%v8 = sitofp i128 undef to float
139+
%v9 = uitofp i128 undef to fp128
140+
%v10 = uitofp i128 undef to double
141+
%v11 = uitofp i128 undef to float
142+
ret void
143+
}

llvm/test/Analysis/CostModel/SystemZ/int-arith.ll

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ define void @add() {
88
%res1 = add i16 undef, undef
99
%res2 = add i32 undef, undef
1010
%res3 = add i64 undef, undef
11+
%resQ = add i128 undef, undef
1112
%res4 = add <2 x i8> undef, undef
1213
%res5 = add <2 x i16> undef, undef
1314
%res6 = add <2 x i32> undef, undef
@@ -29,6 +30,7 @@ define void @add() {
2930
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = add i16 undef, undef
3031
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = add i32 undef, undef
3132
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res3 = add i64 undef, undef
33+
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %resQ = add i128 undef, undef
3234
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = add <2 x i8> undef, undef
3335
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res5 = add <2 x i16> undef, undef
3436
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res6 = add <2 x i32> undef, undef
@@ -54,6 +56,7 @@ define void @sub() {
5456
%res1 = sub i16 undef, undef
5557
%res2 = sub i32 undef, undef
5658
%res3 = sub i64 undef, undef
59+
%resQ = sub i128 undef, undef
5760
%res4 = sub <2 x i8> undef, undef
5861
%res5 = sub <2 x i16> undef, undef
5962
%res6 = sub <2 x i32> undef, undef
@@ -75,6 +78,7 @@ define void @sub() {
7578
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = sub i16 undef, undef
7679
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = sub i32 undef, undef
7780
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res3 = sub i64 undef, undef
81+
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %resQ = sub i128 undef, undef
7882
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = sub <2 x i8> undef, undef
7983
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res5 = sub <2 x i16> undef, undef
8084
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res6 = sub <2 x i32> undef, undef
@@ -100,6 +104,7 @@ define void @mul() {
100104
%res1 = mul i16 undef, undef
101105
%res2 = mul i32 undef, undef
102106
%res3 = mul i64 undef, undef
107+
%resQ = mul i128 undef, undef
103108
%res4 = mul <2 x i8> undef, undef
104109
%res5 = mul <2 x i16> undef, undef
105110
%res6 = mul <2 x i32> undef, undef
@@ -121,6 +126,7 @@ define void @mul() {
121126
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = mul i16 undef, undef
122127
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = mul i32 undef, undef
123128
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res3 = mul i64 undef, undef
129+
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %resQ = mul i128 undef, undef
124130
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = mul <2 x i8> undef, undef
125131
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res5 = mul <2 x i16> undef, undef
126132
; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res6 = mul <2 x i32> undef, undef

0 commit comments

Comments
 (0)