Skip to content

Commit 9a98ab5

Browse files
[AArch64][SVE2] Change the cost of extends with S/URHADD to 0
When SVE2 is enabled, an add of 1, a second add, and a logical shift right by 1 can be combined into a single s/urhadd instruction. If the operands of the adds are extended, the extends fold into the s/urhadd, so their cost should be 0. Reviewed By: david-arm, dtemirbulatov. Differential Revision: https://reviews.llvm.org/D157628
1 parent 06ef752 commit 9a98ab5

File tree

3 files changed

+259
-0
lines changed

3 files changed

+259
-0
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2044,6 +2044,56 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
20442044
return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
20452045
}
20462046

2047+
// s/urhadd instructions implement the following pattern, making the
2048+
// extends free:
2049+
// %x = add ((zext i8 -> i16), 1)
2050+
// %y = (zext i8 -> i16)
2051+
// trunc i16 (lshr (add %x, %y), 1) -> i8
2052+
//
2053+
bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser,
2054+
const CastInst *Ext, Type *Dst,
2055+
Type *Src) {
2056+
2057+
// The source should be a legal vector type.
2058+
if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2059+
(Src->isScalableTy() && !ST->hasSVE2()))
2060+
return false;
2061+
2062+
if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2063+
return false;
2064+
2065+
// Look for trunc/shl/add before trying to match the pattern.
2066+
const Instruction *Add = ExtUser;
2067+
auto *AddUser =
2068+
dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2069+
if (AddUser && AddUser->getOpcode() == Instruction::Add)
2070+
Add = AddUser;
2071+
2072+
auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2073+
if (!Shr || Shr->getOpcode() != Instruction::LShr)
2074+
return false;
2075+
2076+
auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2077+
if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2078+
Src->getScalarSizeInBits() !=
2079+
cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2080+
return false;
2081+
2082+
// Try to match the whole pattern. Ext could be either the first or second
2083+
// m_ZExtOrSExt matched.
2084+
Instruction *Ex1, *Ex2;
2085+
if (!(match(Add, m_c_Add(m_Instruction(Ex1),
2086+
m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
2087+
return false;
2088+
2089+
// Ensure both extends are of the same type
2090+
if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
2091+
Ex1->getOpcode() == Ex2->getOpcode())
2092+
return true;
2093+
2094+
return false;
2095+
}
2096+
20472097
InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
20482098
Type *Src,
20492099
TTI::CastContextHint CCH,
@@ -2068,6 +2118,11 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
20682118
} else // Others are free so long as isWideningInstruction returned true.
20692119
return 0;
20702120
}
2121+
2122+
// The cast will be free for the s/urhadd instructions
2123+
if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
2124+
isExtPartOfAvgExpr(SingleUser, cast<CastInst>(I), Dst, Src))
2125+
return 0;
20712126
}
20722127

20732128
// TODO: Allow non-throughput costs that aren't binary.

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
163163
TTI::TargetCostKind CostKind,
164164
const Instruction *I = nullptr);
165165

166+
bool isExtPartOfAvgExpr(const Instruction *ExtUser, const CastInst *Ext,
167+
Type *Dst, Type *Src);
168+
166169
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
167170
TTI::CastContextHint CCH,
168171
TTI::TargetCostKind CostKind,
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s -check-prefix=SVE
2+
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefix=SVE2
3+
4+
; SRHADD
5+
6+
; Positive test, fixed-width vectors: two <16 x i8> loads are sign-extended to
; <16 x i16>, summed with a splat of 1, shifted right by 1 and truncated back
; to <16 x i8>. Both extends are expected to cost 0 under both RUN lines
; (+sve and +sve2).
define void @srhadd_i8_sext_i16_fixed(ptr %a, ptr %b, ptr %dst) {
7+
; SVE-LABEL: 'srhadd_i8_sext_i16_fixed'
8+
; SVE: Cost Model: Found an estimated cost of 0 for instruction: %ext1 = sext <16 x i8> %ld1 to <16 x i16>
9+
; SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ext2 = sext <16 x i8> %ld2 to <16 x i16>
10+
;
11+
; SVE2-LABEL: 'srhadd_i8_sext_i16_fixed'
12+
; SVE2: Cost Model: Found an estimated cost of 0 for instruction: %ext1 = sext <16 x i8> %ld1 to <16 x i16>
13+
; SVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ext2 = sext <16 x i8> %ld2 to <16 x i16>
14+
;
15+
%ld1 = load <16 x i8>, ptr %a
16+
%ld2 = load <16 x i8>, ptr %b
17+
%ext1 = sext <16 x i8> %ld1 to <16 x i16>
18+
%ext2 = sext <16 x i8> %ld2 to <16 x i16>
19+
%add1 = add nuw nsw <16 x i16> %ext1, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 1, i64 0), <16 x i16> poison, <16 x i32> zeroinitializer)
20+
%add2 = add nuw nsw <16 x i16> %add1, %ext2
21+
%shr = lshr <16 x i16> %add2, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 1, i64 0), <16 x i16> poison, <16 x i32> zeroinitializer)
22+
%trunc = trunc <16 x i16> %shr to <16 x i8>
23+
store <16 x i8> %trunc, ptr %a
24+
ret void
25+
}
26+
27+
; Positive test, scalable vectors: same sext/add/add-1/lshr/trunc chain as the
; fixed-width case but on <vscale x 16 x i8>. Each extend is expected to cost
; 2 with +sve and 0 with +sve2.
define void @srhadd_i8_sext_i16_scalable(ptr %a, ptr %b, ptr %dst) {
28+
; SVE-LABEL: 'srhadd_i8_sext_i16_scalable'
29+
; SVE: Cost Model: Found an estimated cost of 2 for instruction: %ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
30+
; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ext2 = sext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
31+
;
32+
; SVE2-LABEL: 'srhadd_i8_sext_i16_scalable'
33+
; SVE2: Cost Model: Found an estimated cost of 0 for instruction: %ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
34+
; SVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ext2 = sext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
35+
;
36+
%ld1 = load <vscale x 16 x i8>, ptr %a
37+
%ld2 = load <vscale x 16 x i8>, ptr %b
38+
%ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
39+
%ext2 = sext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
40+
%add1 = add nuw nsw <vscale x 16 x i16> %ext1, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
41+
%add2 = add nuw nsw <vscale x 16 x i16> %add1, %ext2
42+
%shr = lshr <vscale x 16 x i16> %add2, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
43+
%trunc = trunc <vscale x 16 x i16> %shr to <vscale x 16 x i8>
44+
store <vscale x 16 x i8> %trunc, ptr %a
45+
ret void
46+
}
47+
48+
; Positive test where the extend widens by more than 2x: <vscale x 8 x i16>
; is sign-extended to i64 elements and the result is truncated back to i16.
; Each extend is expected to cost 6 with +sve and 0 with +sve2.
define void @srhadd_i16_sext_i64_scalable(ptr %a, ptr %b, ptr %dst) {
49+
; SVE-LABEL: 'srhadd_i16_sext_i64_scalable'
50+
; SVE: Cost Model: Found an estimated cost of 6 for instruction: %ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i64>
51+
; SVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i64>
52+
;
53+
; SVE2-LABEL: 'srhadd_i16_sext_i64_scalable'
54+
; SVE2: Cost Model: Found an estimated cost of 0 for instruction: %ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i64>
55+
; SVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i64>
56+
;
57+
%ld1 = load <vscale x 8 x i16>, ptr %a
58+
%ld2 = load <vscale x 8 x i16>, ptr %b
59+
%ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i64>
60+
%ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i64>
61+
%add1 = add nuw nsw <vscale x 8 x i64> %ext1, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
62+
%add2 = add nuw nsw <vscale x 8 x i64> %add1, %ext2
63+
%shr = lshr <vscale x 8 x i64> %add2, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
64+
%trunc = trunc <vscale x 8 x i64> %shr to <vscale x 8 x i16>
65+
store <vscale x 8 x i16> %trunc, ptr %a
66+
ret void
67+
}
68+
69+
; URHADD
70+
71+
; Positive test, fixed-width unsigned variant: two <4 x i32> loads are
; zero-extended to <4 x i64>, summed with a splat of 1, shifted right by 1 and
; truncated back to <4 x i32>. Both extends are expected to cost 0 under both
; RUN lines (+sve and +sve2).
define void @urhadd_i32_zext_i64_fixed(ptr %a, ptr %b, ptr %dst) {
72+
; SVE-LABEL: 'urhadd_i32_zext_i64_fixed'
73+
; SVE: Cost Model: Found an estimated cost of 0 for instruction: %ext1 = zext <4 x i32> %ld1 to <4 x i64>
74+
; SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ext2 = zext <4 x i32> %ld2 to <4 x i64>
75+
;
76+
; SVE2-LABEL: 'urhadd_i32_zext_i64_fixed'
77+
; SVE2: Cost Model: Found an estimated cost of 0 for instruction: %ext1 = zext <4 x i32> %ld1 to <4 x i64>
78+
; SVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ext2 = zext <4 x i32> %ld2 to <4 x i64>
79+
;
80+
%ld1 = load <4 x i32>, ptr %a
81+
%ld2 = load <4 x i32>, ptr %b
82+
%ext1 = zext <4 x i32> %ld1 to <4 x i64>
83+
%ext2 = zext <4 x i32> %ld2 to <4 x i64>
84+
%add1 = add nuw nsw <4 x i64> %ext1, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 1, i64 0), <4 x i64> poison, <4 x i32> zeroinitializer)
85+
%add2 = add nuw nsw <4 x i64> %add1, %ext2
86+
%shr = lshr <4 x i64> %add2, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 1, i64 0), <4 x i64> poison, <4 x i32> zeroinitializer)
87+
%trunc = trunc <4 x i64> %shr to <4 x i32>
88+
store <4 x i32> %trunc, ptr %a
89+
ret void
90+
}
91+
92+
; Positive test with the widest extend in this file: <vscale x 16 x i8> is
; zero-extended to i64 elements and the result is truncated back to i8.
; Each extend is expected to cost 14 with +sve and 0 with +sve2.
define void @urhadd_i8_zext_i64(ptr %a, ptr %b, ptr %dst) {
93+
; SVE-LABEL: 'urhadd_i8_zext_i64'
94+
; SVE: Cost Model: Found an estimated cost of 14 for instruction: %ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i64>
95+
; SVE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i64>
96+
;
97+
; SVE2-LABEL: 'urhadd_i8_zext_i64'
98+
; SVE2: Cost Model: Found an estimated cost of 0 for instruction: %ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i64>
99+
; SVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i64>
100+
;
101+
%ld1 = load <vscale x 16 x i8>, ptr %a
102+
%ld2 = load <vscale x 16 x i8>, ptr %b
103+
%ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i64>
104+
%ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i64>
105+
%add1 = add nuw nsw <vscale x 16 x i64> %ext1, shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
106+
%add2 = add nuw nsw <vscale x 16 x i64> %add1, %ext2
107+
%shr = lshr <vscale x 16 x i64> %add2, shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
108+
%trunc = trunc <vscale x 16 x i64> %shr to <vscale x 16 x i8>
109+
store <vscale x 16 x i8> %trunc, ptr %a
110+
ret void
111+
}
112+
113+
; Positive test, scalable unsigned variant: <vscale x 8 x i16> is zero-extended
; to i32 elements, summed with a splat of 1, shifted right by 1 and truncated
; back to i16. Each extend is expected to cost 2 with +sve and 0 with +sve2.
define void @urhadd_i16_zext_i32(ptr %a, ptr %b, ptr %dst) {
114+
; SVE-LABEL: 'urhadd_i16_zext_i32'
115+
; SVE: Cost Model: Found an estimated cost of 2 for instruction: %ext1 = zext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
116+
; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ext2 = zext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
117+
;
118+
; SVE2-LABEL: 'urhadd_i16_zext_i32'
119+
; SVE2: Cost Model: Found an estimated cost of 0 for instruction: %ext1 = zext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
120+
; SVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ext2 = zext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
121+
;
122+
%ld1 = load <vscale x 8 x i16>, ptr %a
123+
%ld2 = load <vscale x 8 x i16>, ptr %b
124+
%ext1 = zext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
125+
%ext2 = zext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
126+
%add1 = add nuw nsw <vscale x 8 x i32> %ext1, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
127+
%add2 = add nuw nsw <vscale x 8 x i32> %add1, %ext2
128+
%shr = lshr <vscale x 8 x i32> %add2, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
129+
%trunc = trunc <vscale x 8 x i32> %shr to <vscale x 8 x i16>
130+
store <vscale x 8 x i16> %trunc, ptr %a
131+
ret void
132+
}
133+
134+
; NEGATIVE TESTS
135+
136+
; Negative test: the two operands use different extend kinds (%ext1 is a sext,
; %ext2 is a zext), so the extends are not expected to become free. Each
; extend is expected to cost 2 under both RUN lines (+sve and +sve2).
define void @ext_operand_mismatch(ptr %a, ptr %b, ptr %dst) {
137+
; SVE-LABEL: 'ext_operand_mismatch'
138+
; SVE: Cost Model: Found an estimated cost of 2 for instruction: %ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
139+
; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
140+
;
141+
; SVE2-LABEL: 'ext_operand_mismatch'
142+
; SVE2: Cost Model: Found an estimated cost of 2 for instruction: %ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
143+
; SVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
144+
;
145+
%ld1 = load <vscale x 16 x i8>, ptr %a
146+
%ld2 = load <vscale x 16 x i8>, ptr %b
147+
%ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
148+
%ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
149+
%add1 = add nuw nsw <vscale x 16 x i16> %ext1, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
150+
%add2 = add nuw nsw <vscale x 16 x i16> %add1, %ext2
151+
%shr = lshr <vscale x 16 x i16> %add2, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
152+
%trunc = trunc <vscale x 16 x i16> %shr to <vscale x 16 x i8>
153+
store <vscale x 16 x i8> %trunc, ptr %a
154+
ret void
155+
}
156+
157+
; Negative test: the adds have more than one user. %add1 feeds both %add2 and
; %add.res, and %add2 feeds both %shr and %add.res; the stored value comes
; from %add.res, while %trunc is left unused. Each extend is expected to cost
; 2 under both RUN lines (+sve and +sve2).
define void @add_multiple_uses(ptr %a, ptr %b, ptr %dst) {
158+
; SVE-LABEL: 'add_multiple_uses'
159+
; SVE: Cost Model: Found an estimated cost of 2 for instruction: %ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
160+
; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
161+
;
162+
; SVE2-LABEL: 'add_multiple_uses'
163+
; SVE2: Cost Model: Found an estimated cost of 2 for instruction: %ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
164+
; SVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
165+
;
166+
%ld1 = load <vscale x 8 x i16>, ptr %a
167+
%ld2 = load <vscale x 8 x i16>, ptr %b
168+
%ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
169+
%ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
170+
%add1 = add nuw nsw <vscale x 8 x i32> %ext1, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
171+
%add2 = add nuw nsw <vscale x 8 x i32> %add1, %ext2
172+
%shr = lshr <vscale x 8 x i32> %add2, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
173+
%trunc = trunc <vscale x 8 x i32> %shr to <vscale x 8 x i16>
174+
%add.res = add nuw nsw <vscale x 8 x i32> %add1, %add2
175+
%res = trunc <vscale x 8 x i32> %add.res to <vscale x 8 x i16>
176+
store <vscale x 8 x i16> %res, ptr %a
177+
ret void
178+
}
179+
180+
; Negative test: the shift result has more than one user (%shr feeds both
; %trunc and %add3). Each extend is expected to cost 2 under both RUN lines
; (+sve and +sve2).
; NOTE(review): %add2 is also used twice here (by %shr and by %add3), so this
; test may not isolate the shift's use count from the add's - confirm.
define void @shift_multiple_uses(ptr %a, ptr %b, ptr %dst) {
181+
; SVE-LABEL: 'shift_multiple_uses'
182+
; SVE: Cost Model: Found an estimated cost of 2 for instruction: %ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
183+
; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
184+
;
185+
; SVE2-LABEL: 'shift_multiple_uses'
186+
; SVE2: Cost Model: Found an estimated cost of 2 for instruction: %ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
187+
; SVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
188+
;
189+
%ld1 = load <vscale x 16 x i8>, ptr %a
190+
%ld2 = load <vscale x 16 x i8>, ptr %b
191+
%ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
192+
%ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
193+
%add1 = add nuw nsw <vscale x 16 x i16> %ext1, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
194+
%add2 = add nuw nsw <vscale x 16 x i16> %add1, %ext2
195+
%shr = lshr <vscale x 16 x i16> %add2, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
196+
%trunc = trunc <vscale x 16 x i16> %shr to <vscale x 16 x i8>
197+
%add3 = add nuw nsw <vscale x 16 x i16> %shr, %add2
198+
%res = trunc <vscale x 16 x i16> %add3 to <vscale x 16 x i8>
199+
store <vscale x 16 x i8> %res, ptr %a
200+
ret void
201+
}

0 commit comments

Comments
 (0)