Skip to content

Commit 28d7880

Browse files
authored
[WebAssembly] getMemoryOpCost and getCastInstrCost (llvm#122896)
Add initial implementations of these TTI methods for SIMD types. For casts, the costing covers the free extensions provided by extmul_low as well as extend_low. For memory operations, we consider the use of load32_zero and load64_zero, as well as full-width v128 loads.
1 parent 4cfbe55 commit 28d7880

File tree

4 files changed

+693
-2
lines changed

4 files changed

+693
-2
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp

Lines changed: 106 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
//===----------------------------------------------------------------------===//
1414

1515
#include "WebAssemblyTargetTransformInfo.h"
16+
17+
#include "llvm/CodeGen/CostTable.h"
1618
using namespace llvm;
1719

1820
#define DEBUG_TYPE "wasmtti"
@@ -51,8 +53,7 @@ TypeSize WebAssemblyTTIImpl::getRegisterBitWidth(
5153
InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost(
5254
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
5355
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
54-
ArrayRef<const Value *> Args,
55-
const Instruction *CxtI) {
56+
ArrayRef<const Value *> Args, const Instruction *CxtI) {
5657

5758
InstructionCost Cost =
5859
BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
@@ -78,6 +79,109 @@ InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost(
7879
return Cost;
7980
}
8081

82+
/// Estimate the cost of the cast \p Opcode from \p Src to \p Dst.
///
/// When SIMD128 is available and both types map to simple value types, two
/// situations are costed here:
///   * a sign/zero extension whose single user is a multiply, which is
///     costed as free (or as one extra extend for the two-step widenings),
///     matching the extmul_low instructions;
///   * a stand-alone vector sign/zero extension, costed per extend_low
///     instruction via a lookup table.
/// Every other case is delegated to BaseT::getCastInstrCost.
InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
    unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH,
    TTI::TargetCostKind CostKind, const Instruction *I) {
  // Single place for the generic fallback used by all bail-out paths.
  const auto GenericCost = [&] {
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
  };

  const int CastISD = TLI->InstructionOpcodeToISD(Opcode);
  const auto SrcEVT = TLI->getValueType(DL, Src);
  const auto DstEVT = TLI->getValueType(DL, Dst);

  // Only simple value types on a SIMD128-capable subtarget are handled here.
  if (!SrcEVT.isSimple() || !DstEVT.isSimple() || !ST->hasSIMD128())
    return GenericCost();

  const auto ToVT = DstEVT.getSimpleVT();
  const auto FromVT = SrcEVT.getSimpleVT();

  // extmul_low support: an extension whose only user is a multiply.
  if (I && I->hasOneUser()) {
    const auto *OnlyUser = cast<Instruction>(*I->user_begin());
    const bool UserIsMul =
        TLI->InstructionOpcodeToISD(OnlyUser->getOpcode()) == ISD::MUL;
    const bool IsIntExtend =
        CastISD == ISD::ZERO_EXTEND || CastISD == ISD::SIGN_EXTEND;

    if (UserIsMul && IsIntExtend) {
      // Extensions that exactly double the element width are free.
      const bool DoublesWidth =
          (FromVT == MVT::v8i8 && ToVT == MVT::v8i16) ||
          (FromVT == MVT::v4i16 && ToVT == MVT::v4i32) ||
          (FromVT == MVT::v2i32 && ToVT == MVT::v2i64);
      if (DoublesWidth)
        return 0;

      // Quadrupling the element width needs one additional extend_low to
      // produce the intermediate i16/i32 value.
      const bool QuadruplesWidth =
          (FromVT == MVT::v4i8 && ToVT == MVT::v4i32) ||
          (FromVT == MVT::v2i16 && ToVT == MVT::v2i64);
      if (QuadruplesWidth)
        return 1;
    }
  }

  // extend_low: one instruction per doubling of the element width.
  static constexpr TypeConversionCostTblEntry ExtendLowCosts[] = {
      // Single-step extensions.
      {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1},
      {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1},
      {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1},
      {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1},
      {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1},
      {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1},
      // Two-step extensions.
      {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2},
      {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2},
      {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2},
      {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2},
  };

  const auto *Match =
      ConvertCostTableLookup(ExtendLowCosts, CastISD, ToVT, FromVT);
  if (Match)
    return Match->Cost;

  return GenericCost();
}
143+
144+
/// Estimate the cost of the memory operation \p Opcode on type \p Ty.
///
/// Only loads of fixed-width vector types on a SIMD128-capable subtarget are
/// costed here; everything else is delegated to BaseT::getMemoryOpCost.
/// \p OpInfo and \p I are accepted for interface compatibility but are not
/// consulted and are not forwarded to the base implementation.
///
/// \returns 2 for 32-, 64- and 128-bit vector loads, an invalid cost when
/// type legalization yields an invalid cost, and the base cost otherwise.
InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
    unsigned Opcode, Type *Ty, MaybeAlign Alignment, unsigned AddressSpace,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
    const Instruction *I) {
  // Single place for the generic fallback used by all bail-out paths.
  const auto GenericCost = [&] {
    return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
                                  CostKind);
  };

  if (!ST->hasSIMD128() || !isa<FixedVectorType>(Ty))
    return GenericCost();

  // Stores (and anything else that is not a load) use the generic cost.
  if (TLI->InstructionOpcodeToISD(Opcode) != ISD::LOAD)
    return GenericCost();

  // Type legalization can't handle structs
  EVT LoadVT = TLI->getValueType(DL, Ty, true);
  if (LoadVT == MVT::Other)
    return GenericCost();

  const auto Legalized = getTypeLegalizationCost(Ty);
  if (!Legalized.first.isValid())
    return InstructionCost::getInvalid();

  // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads can
  // be lowered to load32_zero and load64_zero respectively. Assume SIMD loads
  // are twice as expensive as scalar.
  const unsigned Bits = LoadVT.getSizeInBits();
  if (Bits == 32 || Bits == 64 || Bits == 128)
    return 2;

  return GenericCost();
}
184+
81185
InstructionCost
82186
WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
83187
TTI::TargetCostKind CostKind,

llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,16 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
6464
TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
6565
TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
6666
ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr);
67+
68+
/// Cost of the cast \p Opcode converting \p Src to \p Dst. The definition in
/// WebAssemblyTargetTransformInfo.cpp special-cases SIMD128 vector sign/zero
/// extensions and defers to BaseT::getCastInstrCost for everything else.
/// \p I, when provided, is the cast instruction itself; its single user is
/// inspected to detect an extension that feeds a multiply.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
69+
TTI::CastContextHint CCH,
70+
TTI::TargetCostKind CostKind,
71+
const Instruction *I = nullptr);
72+
/// Cost of the memory operation \p Opcode on type \p Src. The definition in
/// WebAssemblyTargetTransformInfo.cpp returns a fixed cost for 32-, 64- and
/// 128-bit fixed-vector loads when SIMD128 is enabled and defers to
/// BaseT::getMemoryOpCost otherwise. \p OpInfo and \p I are not consulted by
/// that definition.
InstructionCost getMemoryOpCost(
73+
unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace,
74+
TTI::TargetCostKind CostKind,
75+
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
76+
const Instruction *I = nullptr);
6777
using BaseT::getVectorInstrCost;
6878
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
6979
TTI::TargetCostKind CostKind,
Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s
2+
3+
target triple = "wasm32"
4+
5+
; Signed i8 multiply-accumulate into an i32 sum: each iteration loads one i8
; from %a and one from %b, sign-extends both to i32, multiplies them and adds
; the product to the running total; returns 0 when %N is 0.
; The expectations below require 32-bit zero-filling vector loads, an
; extend_low on each operand, and a signed extmul_low feeding i32x4.add.
define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
6+
; CHECK-LABEL: i32_mac_s8:
7+
; CHECK: v128.load32_zero 0:p2align=0
8+
; CHECK: i16x8.extend_low_i8x16_s
9+
; CHECK: v128.load32_zero 0:p2align=0
10+
; CHECK: i16x8.extend_low_i8x16_s
11+
; CHECK: i32x4.extmul_low_i16x8_s
12+
; CHECK: i32x4.add
13+
entry:
14+
%cmp7.not = icmp eq i32 %N, 0
15+
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
16+
17+
for.cond.cleanup: ; preds = %for.body, %entry
18+
%res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
19+
ret i32 %res.0.lcssa
20+
21+
for.body: ; preds = %entry, %for.body
22+
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
23+
%res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ]
24+
%arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.09
25+
%0 = load i8, ptr %arrayidx, align 1
26+
%conv = sext i8 %0 to i32
27+
%arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.09
28+
%1 = load i8, ptr %arrayidx1, align 1
29+
%conv2 = sext i8 %1 to i32
30+
%mul = mul nsw i32 %conv2, %conv
31+
%add = add nsw i32 %mul, %res.08
32+
%inc = add nuw i32 %i.09, 1
33+
%exitcond.not = icmp eq i32 %inc, %N
34+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
35+
}
36+
37+
; Signed i16 multiply-accumulate into an i32 sum: each iteration loads one i16
; from %a and one from %b, sign-extends both to i32, multiplies them and adds
; the product to the running total; returns 0 when %N is 0.
; The expectations below require two sign-extending load16x4 loads followed by
; i32x4.mul and i32x4.add.
define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
38+
; CHECK-LABEL: i32_mac_s16:
39+
; CHECK: i32x4.load16x4_s 0:p2align=1
40+
; CHECK: i32x4.load16x4_s 0:p2align=1
41+
; CHECK: i32x4.mul
42+
; CHECK: i32x4.add
43+
entry:
44+
%cmp7.not = icmp eq i32 %N, 0
45+
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
46+
47+
for.cond.cleanup: ; preds = %for.body, %entry
48+
%res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
49+
ret i32 %res.0.lcssa
50+
51+
for.body: ; preds = %entry, %for.body
52+
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
53+
%res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ]
54+
%arrayidx = getelementptr inbounds i16, ptr %a, i32 %i.09
55+
%0 = load i16, ptr %arrayidx, align 2
56+
%conv = sext i16 %0 to i32
57+
%arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.09
58+
%1 = load i16, ptr %arrayidx1, align 2
59+
%conv2 = sext i16 %1 to i32
60+
%mul = mul nsw i32 %conv2, %conv
61+
%add = add nsw i32 %mul, %res.08
62+
%inc = add nuw i32 %i.09, 1
63+
%exitcond.not = icmp eq i32 %inc, %N
64+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
65+
}
66+
67+
; Signed i16 multiply-accumulate into an i64 sum: each iteration loads one i16
; from %a and one from %b, sign-extends both to i64, multiplies them and adds
; the product to the running total; returns 0 when %N is 0.
; The expectations below require 32-bit zero-filling vector loads, an
; i16->i32 extend_low on each operand, and a signed i64x2 extmul_low feeding
; i64x2.add.
define hidden i64 @i64_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
68+
; CHECK-LABEL: i64_mac_s16:
69+
; CHECK: v128.load32_zero 0:p2align=1
70+
; CHECK: i32x4.extend_low_i16x8_s
71+
; CHECK: v128.load32_zero 0:p2align=1
72+
; CHECK: i32x4.extend_low_i16x8_s
73+
; CHECK: i64x2.extmul_low_i32x4_s
74+
; CHECK: i64x2.add
75+
entry:
76+
%cmp7.not = icmp eq i32 %N, 0
77+
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
78+
79+
for.cond.cleanup: ; preds = %for.body, %entry
80+
%res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
81+
ret i64 %res.0.lcssa
82+
83+
for.body: ; preds = %entry, %for.body
84+
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
85+
%res.08 = phi i64 [ %add, %for.body ], [ 0, %entry ]
86+
%arrayidx = getelementptr inbounds i16, ptr %a, i32 %i.09
87+
%0 = load i16, ptr %arrayidx, align 2
88+
%conv = sext i16 %0 to i64
89+
%arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.09
90+
%1 = load i16, ptr %arrayidx1, align 2
91+
%conv2 = sext i16 %1 to i64
92+
%mul = mul nsw i64 %conv2, %conv
93+
%add = add nsw i64 %mul, %res.08
94+
%inc = add nuw i32 %i.09, 1
95+
%exitcond.not = icmp eq i32 %inc, %N
96+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
97+
}
98+
99+
; i32 multiply with signed widening accumulate: each iteration loads one i32
; from %a and one from %b, multiplies them in i32, sign-extends the product to
; i64 and adds it to the running total; returns 0 when %N is 0.
; The expectations below require two 64-bit zero-filling vector loads,
; i32x4.mul, a signed i32->i64 extend_low, and i64x2.add.
define hidden i64 @i64_mac_s32(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
100+
; CHECK-LABEL: i64_mac_s32:
101+
; CHECK: v128.load64_zero 0:p2align=2
102+
; CHECK: v128.load64_zero 0:p2align=2
103+
; CHECK: i32x4.mul
104+
; CHECK: i64x2.extend_low_i32x4_s
105+
; CHECK: i64x2.add
106+
entry:
107+
%cmp6.not = icmp eq i32 %N, 0
108+
br i1 %cmp6.not, label %for.cond.cleanup, label %for.body
109+
110+
for.cond.cleanup: ; preds = %for.body, %entry
111+
%res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
112+
ret i64 %res.0.lcssa
113+
114+
for.body: ; preds = %entry, %for.body
115+
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
116+
%res.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
117+
%arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.08
118+
%0 = load i32, ptr %arrayidx, align 4
119+
%arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.08
120+
%1 = load i32, ptr %arrayidx1, align 4
121+
%mul = mul i32 %1, %0
122+
%conv = sext i32 %mul to i64
123+
%add = add i64 %res.07, %conv
124+
%inc = add nuw i32 %i.08, 1
125+
%exitcond.not = icmp eq i32 %inc, %N
126+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
127+
}
128+
129+
; Unsigned i8 multiply-accumulate into an i32 sum: each iteration loads one i8
; from %a and one from %b, zero-extends both to i32, multiplies them and adds
; the product to the running total; returns 0 when %N is 0.
; The expectations below require 32-bit zero-filling vector loads, an unsigned
; extend_low on each operand, and an unsigned extmul_low feeding i32x4.add.
define hidden i32 @i32_mac_u8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
130+
; CHECK-LABEL: i32_mac_u8:
131+
; CHECK: v128.load32_zero 0:p2align=0
132+
; CHECK: i16x8.extend_low_i8x16_u
133+
; CHECK: v128.load32_zero 0:p2align=0
134+
; CHECK: i16x8.extend_low_i8x16_u
135+
; CHECK: i32x4.extmul_low_i16x8_u
136+
; CHECK: i32x4.add
137+
entry:
138+
%cmp7.not = icmp eq i32 %N, 0
139+
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
140+
141+
for.cond.cleanup: ; preds = %for.body, %entry
142+
%res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
143+
ret i32 %res.0.lcssa
144+
145+
for.body: ; preds = %entry, %for.body
146+
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
147+
%res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ]
148+
%arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.09
149+
%0 = load i8, ptr %arrayidx, align 1
150+
%conv = zext i8 %0 to i32
151+
%arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.09
152+
%1 = load i8, ptr %arrayidx1, align 1
153+
%conv2 = zext i8 %1 to i32
154+
%mul = mul nuw nsw i32 %conv2, %conv
155+
%add = add i32 %mul, %res.08
156+
%inc = add nuw i32 %i.09, 1
157+
%exitcond.not = icmp eq i32 %inc, %N
158+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
159+
}
160+
161+
; Unsigned i16 multiply-accumulate into an i32 sum: each iteration loads one
; i16 from %a and one from %b, zero-extends both to i32, multiplies them and
; adds the product to the running total; returns 0 when %N is 0.
; The expectations below require two zero-extending load16x4 loads followed by
; i32x4.mul and i32x4.add.
define hidden i32 @i32_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
162+
; CHECK-LABEL: i32_mac_u16:
163+
; CHECK: i32x4.load16x4_u 0:p2align=1
164+
; CHECK: i32x4.load16x4_u 0:p2align=1
165+
; CHECK: i32x4.mul
166+
; CHECK: i32x4.add
167+
entry:
168+
%cmp7.not = icmp eq i32 %N, 0
169+
br i1 %cmp7.not, label %for.cond.cleanup, label %for.body
170+
171+
for.cond.cleanup: ; preds = %for.body, %entry
172+
%res.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
173+
ret i32 %res.0.lcssa
174+
175+
for.body: ; preds = %entry, %for.body
176+
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
177+
%res.08 = phi i32 [ %add, %for.body ], [ 0, %entry ]
178+
%arrayidx = getelementptr inbounds i16, ptr %a, i32 %i.09
179+
%0 = load i16, ptr %arrayidx, align 2
180+
%conv = zext i16 %0 to i32
181+
%arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.09
182+
%1 = load i16, ptr %arrayidx1, align 2
183+
%conv2 = zext i16 %1 to i32
184+
%mul = mul nuw nsw i32 %conv2, %conv
185+
%add = add i32 %mul, %res.08
186+
%inc = add nuw i32 %i.09, 1
187+
%exitcond.not = icmp eq i32 %inc, %N
188+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
189+
}
190+
191+
; Unsigned i16 multiply-accumulate into an i64 sum: each iteration loads one
; i16 from %a and one from %b, zero-extends both to i64, multiplies them and
; adds the product to the running total; returns 0 when %N is 0.
; The expectations below require 32-bit zero-filling vector loads, an unsigned
; i16->i32 extend_low on each operand, and an unsigned i64x2 extmul_low
; feeding i64x2.add.
define hidden i64 @i64_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
192+
; CHECK-LABEL: i64_mac_u16:
193+
; CHECK: v128.load32_zero 0:p2align=1
194+
; CHECK: i32x4.extend_low_i16x8_u
195+
; CHECK: v128.load32_zero 0:p2align=1
196+
; CHECK: i32x4.extend_low_i16x8_u
197+
; CHECK: i64x2.extmul_low_i32x4_u
198+
; CHECK: i64x2.add
199+
entry:
200+
%cmp8.not = icmp eq i32 %N, 0
201+
br i1 %cmp8.not, label %for.cond.cleanup, label %for.body
202+
203+
for.cond.cleanup: ; preds = %for.body, %entry
204+
%res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
205+
ret i64 %res.0.lcssa
206+
207+
for.body: ; preds = %entry, %for.body
208+
%i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
209+
%res.09 = phi i64 [ %add, %for.body ], [ 0, %entry ]
210+
%arrayidx = getelementptr inbounds i16, ptr %a, i32 %i.010
211+
%0 = load i16, ptr %arrayidx, align 2
212+
%conv = zext i16 %0 to i64
213+
%arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.010
214+
%1 = load i16, ptr %arrayidx1, align 2
215+
%conv2 = zext i16 %1 to i64
216+
%mul = mul nuw nsw i64 %conv2, %conv
217+
%add = add i64 %mul, %res.09
218+
%inc = add nuw i32 %i.010, 1
219+
%exitcond.not = icmp eq i32 %inc, %N
220+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
221+
}
222+
223+
; i32 multiply with unsigned widening accumulate: each iteration loads one i32
; from %a and one from %b, multiplies them in i32, zero-extends the product to
; i64 and adds it to the running total; returns 0 when %N is 0.
; The expectations below require two 64-bit zero-filling vector loads,
; i32x4.mul, an unsigned i32->i64 extend_low, and i64x2.add.
define hidden i64 @i64_mac_u32(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
224+
; CHECK-LABEL: i64_mac_u32:
225+
; CHECK: v128.load64_zero 0:p2align=2
226+
; CHECK: v128.load64_zero 0:p2align=2
227+
; CHECK: i32x4.mul
228+
; CHECK: i64x2.extend_low_i32x4_u
229+
; CHECK: i64x2.add
230+
entry:
231+
%cmp6.not = icmp eq i32 %N, 0
232+
br i1 %cmp6.not, label %for.cond.cleanup, label %for.body
233+
234+
for.cond.cleanup: ; preds = %for.body, %entry
235+
%res.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
236+
ret i64 %res.0.lcssa
237+
238+
for.body: ; preds = %entry, %for.body
239+
%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
240+
%res.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
241+
%arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.08
242+
%0 = load i32, ptr %arrayidx, align 4
243+
%arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.08
244+
%1 = load i32, ptr %arrayidx1, align 4
245+
%mul = mul i32 %1, %0
246+
%conv = zext i32 %mul to i64
247+
%add = add i64 %res.07, %conv
248+
%inc = add nuw i32 %i.08, 1
249+
%exitcond.not = icmp eq i32 %inc, %N
250+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
251+
}

0 commit comments

Comments
 (0)