Skip to content

Commit 291101a

Browse files
yolanda15 authored and tlively committed
[WebAssembly] Optimize vector shift using a splat value from outside block
The vector shift operation in WebAssembly uses an i32 shift amount type, while LLVM IR requires both operands of a binary operator to have the same type. When the shift amount operand is splatted in a different block, the splat source is not exported and the vector shift is unrolled into scalar shifts. This patch enables the vector shift to identify the splat source value from the other block and to generate the expected WebAssembly bytecode when lowering. Reviewed By: tlively Differential Revision: https://reviews.llvm.org/D158399
1 parent 52b93d2 commit 291101a

File tree

3 files changed

+131
-0
lines changed

3 files changed

+131
-0
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "llvm/IR/Function.h"
3333
#include "llvm/IR/Intrinsics.h"
3434
#include "llvm/IR/IntrinsicsWebAssembly.h"
35+
#include "llvm/IR/PatternMatch.h"
3536
#include "llvm/Support/Debug.h"
3637
#include "llvm/Support/ErrorHandling.h"
3738
#include "llvm/Support/KnownBits.h"
@@ -833,6 +834,30 @@ bool WebAssemblyTargetLowering::isOffsetFoldingLegal(
833834
return isa<Function>(GV) ? false : TargetLowering::isOffsetFoldingLegal(GA);
834835
}
835836

837+
/// Decide whether operands of \p I should be sunk into the block that uses
/// them, and record which ones.
///
/// Only vector shifts are handled. When the shift amount is a non-constant
/// splat of the form
///   shufflevector (insertelement _, x, 0), _, zeroinitializer
/// both the insertelement and the shufflevector are reported, so that the
/// scalar splat source is visible next to the shift. Without this, a splat
/// defined in another block leads to the vector shift being unrolled into
/// scalar shifts.
///
/// \param I   The instruction whose operands are being considered.
/// \param Ops Receives the uses to sink: first the shuffle's insertelement
///            operand, then the shift-amount operand of \p I.
/// \returns true if uses were appended to \p Ops, false otherwise.
bool WebAssemblyTargetLowering::shouldSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  using namespace llvm::PatternMatch;

  if (!I->getType()->isVectorTy() || !I->isShift())
    return false;

  Value *V = I->getOperand(1);
  // We don't need to sink a constant splat.
  // isa<> is the idiomatic type test here; the casted pointer is not needed.
  if (isa<Constant>(V))
    return false;

  if (match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
                         m_Value(), m_ZeroMask()))) {
    // Sink insert
    Ops.push_back(&cast<Instruction>(V)->getOperandUse(0));
    // Sink shuffle
    Ops.push_back(&I->getOperandUse(1));
    return true;
  }

  return false;
}
860+
836861
EVT WebAssemblyTargetLowering::getSetCCResultType(const DataLayout &DL,
837862
LLVMContext &C,
838863
EVT VT) const {

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ class WebAssemblyTargetLowering final : public TargetLowering {
7676
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
7777
bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
7878
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
79+
// Collects into Ops the operand uses of I that should be sunk next to I.
// The definition in WebAssemblyISelLowering.cpp handles vector shifts whose
// shift amount is a non-constant splat; it returns true when uses were added.
bool shouldSinkOperands(Instruction *I,
80+
SmallVectorImpl<Use *> &Ops) const override;
7981
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
8082
EVT VT) const override;
8183
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2+
; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
3+
4+
; Test that SIMD shifts can be lowered correctly even when shift
5+
; values are exported from outside blocks.
6+
7+
target triple = "wasm32-unknown-unknown"
8+
9+
define void @shl_loop(ptr %a, i8 %shift, i32 %count) {
10+
; CHECK-LABEL: shl_loop:
11+
; CHECK: .functype shl_loop (i32, i32, i32) -> ()
12+
; CHECK-NEXT: # %bb.0: # %entry
13+
; CHECK-NEXT: .LBB0_1: # %body
14+
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
15+
; CHECK-NEXT: loop # label0:
16+
; CHECK-NEXT: local.get 0
17+
; CHECK-NEXT: local.get 0
18+
; CHECK-NEXT: v128.load 0:p2align=0
19+
; CHECK-NEXT: local.get 1
20+
; CHECK-NEXT: i8x16.shl
21+
; CHECK-NEXT: v128.store 16
22+
; CHECK-NEXT: local.get 0
23+
; CHECK-NEXT: i32.const 16
24+
; CHECK-NEXT: i32.add
25+
; CHECK-NEXT: local.set 0
26+
; CHECK-NEXT: local.get 2
27+
; CHECK-NEXT: i32.const -1
28+
; CHECK-NEXT: i32.add
29+
; CHECK-NEXT: local.tee 2
30+
; CHECK-NEXT: i32.eqz
31+
; CHECK-NEXT: br_if 0 # 0: up to label0
32+
; CHECK-NEXT: # %bb.2: # %exit
33+
; CHECK-NEXT: end_loop
34+
; CHECK-NEXT: # fallthrough-return
35+
; The checks above expect one i8x16.shl per iteration whose shift amount is
; the scalar parameter itself (local.get 1), with nothing else between the
; vector load and the vector store.
entry:
36+
; Splat %shift across all 16 lanes here in %entry, outside the loop.
%t1 = insertelement <16 x i8> undef, i8 %shift, i32 0
37+
%vshift = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer
38+
br label %body
39+
body:
40+
%out = phi ptr [%a, %entry], [%b, %body]
41+
%i = phi i32 [0, %entry], [%next, %body]
42+
%v = load <16 x i8>, ptr %out, align 1
43+
; The shift amount %vshift is defined in %entry, not in this block.
%r = shl <16 x i8> %v, %vshift
44+
%b = getelementptr inbounds i8, ptr %out, i32 16
45+
store <16 x i8> %r, ptr %b
46+
%next = add i32 %i, 1
47+
; The back-edge to %body is taken when %next equals %count.
%i.cmp = icmp eq i32 %next, %count
48+
br i1 %i.cmp, label %body, label %exit
49+
exit:
50+
ret void
51+
}
52+
53+
; Test that SIMD shifts can be lowered correctly when shift value
54+
; is a phi inside loop body.
55+
56+
define void @shl_phi_loop(ptr %a, i8 %shift, i32 %count) {
57+
; CHECK-LABEL: shl_phi_loop:
58+
; CHECK: .functype shl_phi_loop (i32, i32, i32) -> ()
59+
; CHECK-NEXT: # %bb.0: # %entry
60+
; CHECK-NEXT: .LBB1_1: # %body
61+
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
62+
; CHECK-NEXT: loop # label1:
63+
; CHECK-NEXT: local.get 0
64+
; CHECK-NEXT: local.get 0
65+
; CHECK-NEXT: v128.load 0:p2align=0
66+
; CHECK-NEXT: local.get 1
67+
; CHECK-NEXT: i8x16.shl
68+
; CHECK-NEXT: v128.store 16
69+
; CHECK-NEXT: local.get 1
70+
; CHECK-NEXT: i32.const 1
71+
; CHECK-NEXT: i32.and
72+
; CHECK-NEXT: local.set 1
73+
; CHECK-NEXT: local.get 0
74+
; CHECK-NEXT: i32.const 16
75+
; CHECK-NEXT: i32.add
76+
; CHECK-NEXT: local.set 0
77+
; CHECK-NEXT: local.get 2
78+
; CHECK-NEXT: i32.const -1
79+
; CHECK-NEXT: i32.add
80+
; CHECK-NEXT: local.tee 2
81+
; CHECK-NEXT: i32.eqz
82+
; CHECK-NEXT: br_if 0 # 0: up to label1
83+
; CHECK-NEXT: # %bb.2: # %exit
84+
; CHECK-NEXT: end_loop
85+
; CHECK-NEXT: # fallthrough-return
86+
; The checks above expect one i8x16.shl per iteration fed directly by the
; scalar in local 1, which is then updated with i32.and for the next iteration.
entry:
87+
br label %body
88+
body:
89+
%out = phi ptr [%a, %entry], [%b, %body]
90+
%i = phi i32 [0, %entry], [%next, %body]
91+
; The scalar shift amount is loop-carried: %shift on entry, %sand afterwards.
%t1 = phi i8 [%shift, %entry], [%sand, %body]
92+
; Here the splat is built in the same block as the shift that uses it.
%t2 = insertelement <16 x i8> undef, i8 %t1, i32 0
93+
%vshift = shufflevector <16 x i8> %t2, <16 x i8> undef, <16 x i32> zeroinitializer
94+
%v = load <16 x i8>, ptr %out, align 1
95+
%r = shl <16 x i8> %v, %vshift
96+
%b = getelementptr inbounds i8, ptr %out, i32 16
97+
store <16 x i8> %r, ptr %b
98+
%sand = and i8 %t1, 1
99+
%next = add i32 %i, 1
100+
; The back-edge to %body is taken when %next equals %count.
%i.cmp = icmp eq i32 %next, %count
101+
br i1 %i.cmp, label %body, label %exit
102+
exit:
103+
ret void
104+
}

0 commit comments

Comments
 (0)