[WebAssembly] Optimize vector shift using a splat value from outside block

yolanda15 · tlively · commit 291101aa8ea5 · 2023-08-25T08:13:27.000-07:00
The vector shift operation in WebAssembly uses an i32 shift amount type, while LLVM IR requires that both operands of a binary operator have the same type. When the shift amount operand is splatted from a different block, the splat source will not be exported and the vector shift will be unrolled to scalar shifts. This patch enables the vector shift to identify the splat source value from the other block and generate the expected WebAssembly bytecode when lowering. Reviewed By: tlively Differential Revision: https://reviews.llvm.org/D158399
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -32,6 +32,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsWebAssembly.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/KnownBits.h"
@@ -833,6 +834,30 @@ bool WebAssemblyTargetLowering::isOffsetFoldingLegal(
   return isa<Function>(GV) ? false : TargetLowering::isOffsetFoldingLegal(GA);
 }
 
+// Returns true, and fills Ops, when the shift amount of vector shift I is a
+// non-constant splat (insertelement into lane 0 + zero-mask shufflevector).
+bool WebAssemblyTargetLowering::shouldSinkOperands(
+    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
+  using namespace llvm::PatternMatch;
+  if (!I->getType()->isVectorTy() || !I->isShift())
+    return false;
+
+  Value *V = I->getOperand(1);
+  // We don't need to sink a constant splat.
+  if (isa<Constant>(V))
+    return false;
+
+  if (match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
+                         m_Value(), m_ZeroMask()))) {
+    // Sink insert: the shuffle's first operand is the matched insertelement.
+    Ops.push_back(&cast<Instruction>(V)->getOperandUse(0));
+    // Sink shuffle: the shift-amount operand of I itself.
+    Ops.push_back(&I->getOperandUse(1));
+    return true;
+  }
+  return false;
+}
+
 EVT WebAssemblyTargetLowering::getSetCCResultType(const DataLayout &DL,
                                                   LLVMContext &C,
                                                   EVT VT) const {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -76,6 +76,8 @@ class WebAssemblyTargetLowering final : public TargetLowering {
   bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
   bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
   bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+  bool shouldSinkOperands(Instruction *I,
+                          SmallVectorImpl<Use *> &Ops) const override;
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                          EVT VT) const override;
   bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
diff --git a/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll b/llvm/test/CodeGen/WebAssembly/simd-shift-in-loop.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+; Test that SIMD shifts can be lowered correctly even when shift
+; values are exported from outside blocks.
+
+target triple = "wasm32-unknown-unknown"
+
+define void @shl_loop(ptr %a, i8 %shift, i32 %count) { ; splat of %shift is built in %entry and used by the shl in %body
+; CHECK-LABEL: shl_loop:
+; CHECK:         .functype shl_loop (i32, i32, i32) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:  .LBB0_1: # %body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    loop # label0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 0:p2align=0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shl
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.set 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i32.const -1
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.tee 2
+; CHECK-NEXT:    i32.eqz
+; CHECK-NEXT:    br_if 0 # 0: up to label0
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    end_loop
+; CHECK-NEXT:    # fallthrough-return
+entry:
+ %t1 = insertelement <16 x i8> undef, i8 %shift, i32 0 ; put the scalar shift amount in lane 0
+ %vshift = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer ; zero mask broadcasts lane 0 to all 16 lanes
+ br label %body
+body:
+ %out = phi ptr [%a, %entry], [%b, %body]
+ %i = phi i32 [0, %entry], [%next, %body]
+ %v = load <16 x i8>, ptr %out, align 1
+ %r = shl <16 x i8> %v, %vshift ; shift amount is the splat defined in %entry, a different block
+ %b = getelementptr inbounds i8, ptr %out, i32 16 ; advance 16 bytes past %out
+ store <16 x i8> %r, ptr %b
+ %next = add i32 %i, 1
+ %i.cmp = icmp eq i32 %next, %count
+ br i1 %i.cmp, label %body, label %exit ; back edge is taken while %next equals %count
+exit:
+ ret void
+}
+
+; Test that SIMD shifts can be lowered correctly when the shift value
+; is a phi inside the loop body.
+
+define void @shl_phi_loop(ptr %a, i8 %shift, i32 %count) { ; splat is built inside %body from a phi of the scalar shift amount
+; CHECK-LABEL: shl_phi_loop:
+; CHECK:         .functype shl_phi_loop (i32, i32, i32) -> ()
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:  .LBB1_1: # %body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    loop # label1:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load 0:p2align=0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shl
+; CHECK-NEXT:    v128.store 16
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    local.set 1
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.set 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i32.const -1
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    local.tee 2
+; CHECK-NEXT:    i32.eqz
+; CHECK-NEXT:    br_if 0 # 0: up to label1
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    end_loop
+; CHECK-NEXT:    # fallthrough-return
+entry:
+ br label %body
+body:
+ %out = phi ptr [%a, %entry], [%b, %body]
+ %i = phi i32 [0, %entry], [%next, %body]
+ %t1 = phi i8 [%shift, %entry], [%sand, %body] ; scalar shift amount: %shift on entry, %sand on later iterations
+ %t2 = insertelement <16 x i8> undef, i8 %t1, i32 0 ; put the phi value in lane 0
+ %vshift = shufflevector <16 x i8> %t2, <16 x i8> undef, <16 x i32> zeroinitializer ; zero mask broadcasts lane 0 to all 16 lanes
+ %v = load <16 x i8>, ptr %out, align 1
+ %r = shl <16 x i8> %v, %vshift ; shift amount is the splat built in this same block
+ %b = getelementptr inbounds i8, ptr %out, i32 16 ; advance 16 bytes past %out
+ store <16 x i8> %r, ptr %b
+ %sand = and i8 %t1, 1 ; next iteration's shift amount
+ %next = add i32 %i, 1
+ %i.cmp = icmp eq i32 %next, %count
+ br i1 %i.cmp, label %body, label %exit ; back edge is taken while %next equals %count
+exit:
+ ret void
+}