[WebAssembly] Recognise EXTEND_HIGH #123325
Conversation
@llvm/pr-subscribers-backend-webassembly

Author: Sam Parker (sparker-arm)

Changes: When lowering EXTEND_VECTOR_INREG, check whether the operand is a shuffle that moves the top half of a vector into the lower half. If so, we can use EXTEND_HIGH on the shuffle's input instead.

Patch is 24.72 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/123325.diff

2 Files Affected:
- llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
- llvm/test/CodeGen/WebAssembly/extend-shuffles.ll
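To make the matched pattern concrete, here is a minimal IR sketch (hypothetical function and value names): the shuffle moves the high eight lanes of %v into the low half, and the extend reads only that low half, so the pair can collapse to a single extend_high of %v.

define <8 x i16> @sext_high_half(<16 x i8> %v) {
  %high = shufflevector <16 x i8> %v, <16 x i8> poison,
                        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %ext = sext <8 x i8> %high to <8 x i16>
  ret <8 x i16> %ext
}
; With this patch, the expected selection is i16x8.extend_high_i8x16_s rather
; than an i8x16.shuffle followed by i16x8.extend_low_i8x16_s.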
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 084aed6eed46d3..81e2d65e163aea 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -2143,6 +2143,35 @@ WebAssemblyTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
Op.getOperand(1));
}
+static SDValue GetHighToLowShuffleOperand(SDValue Op) {
+ if (Op.getOpcode() != ISD::VECTOR_SHUFFLE)
+ return SDValue();
+
+ // Look for a shuffle which moves from the high half to the low half. We
+ // can then use EXTEND_HIGH instead.
+ auto IsHighToLow = [](ShuffleVectorSDNode *Shuffle) {
+ ArrayRef<int> Mask = Shuffle->getMask();
+
+ size_t BeginElement = Mask.size() / 2;
+ for (size_t i = 0; i < Mask.size() / 2; ++i) {
+ if (Mask[i] != static_cast<int>(BeginElement + i)) {
+ return false;
+ }
+ }
+ // The rest will be undef.
+ for (size_t i = Mask.size() / 2; i < Mask.size(); ++i) {
+ if (Mask[i] != -1) {
+ return false;
+ }
+ }
+ return true;
+ };
+ if (IsHighToLow(cast<ShuffleVectorSDNode>(Op.getNode()))) {
+ return Op.getOperand(0);
+ }
+ return SDValue();
+}
+
SDValue
WebAssemblyTargetLowering::LowerEXTEND_VECTOR_INREG(SDValue Op,
SelectionDAG &DAG) const {
@@ -2172,6 +2201,15 @@ WebAssemblyTargetLowering::LowerEXTEND_VECTOR_INREG(SDValue Op,
break;
}
+ if (Scale == 2) {
+ if (auto ShuffleIn = GetHighToLowShuffleOperand(Op.getOperand(0))) {
+ unsigned Opc = Ext == WebAssemblyISD::EXTEND_LOW_S
+ ? WebAssemblyISD::EXTEND_HIGH_S
+ : WebAssemblyISD::EXTEND_HIGH_U;
+ return DAG.getNode(Opc, DL, VT, ShuffleIn);
+ }
+ }
+
SDValue Ret = Src;
while (Scale != 1) {
Ret = DAG.getNode(Ext, DL,
diff --git a/llvm/test/CodeGen/WebAssembly/extend-shuffles.ll b/llvm/test/CodeGen/WebAssembly/extend-shuffles.ll
new file mode 100644
index 00000000000000..7b4a0e72f84827
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/extend-shuffles.ll
@@ -0,0 +1,442 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc < %s -mtriple=wasm32 -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s --check-prefix=SIMD128
+
+target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-n32:64-S128-ni:1:10:20"
+target triple = "wasm32"
+
+define i32 @sext_v8i8(ptr %in) {
+; SIMD128-LABEL: sext_v8i8:
+; SIMD128: .functype sext_v8i8 (i32) -> (i32)
+; SIMD128-NEXT: # %bb.0:
+; SIMD128-NEXT: v128.load64_zero $push14=, 0($0):p2align=0
+; SIMD128-NEXT: local.tee $push13=, $1=, $pop14
+; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push3=, $pop13
+; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push4=, $pop3
+; SIMD128-NEXT: i8x16.shuffle $push0=, $1, $1, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push1=, $pop0
+; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push2=, $pop1
+; SIMD128-NEXT: i32x4.add $push12=, $pop4, $pop2
+; SIMD128-NEXT: local.tee $push11=, $1=, $pop12
+; SIMD128-NEXT: i8x16.shuffle $push5=, $1, $1, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push10=, $pop11, $pop5
+; SIMD128-NEXT: local.tee $push9=, $1=, $pop10
+; SIMD128-NEXT: i8x16.shuffle $push6=, $1, $1, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push7=, $pop9, $pop6
+; SIMD128-NEXT: i32x4.extract_lane $push8=, $pop7, 0
+; SIMD128-NEXT: return $pop8
+ %narrow.load = load <8 x i8>, ptr %in, align 1
+ %sext = sext <8 x i8> %narrow.load to <8 x i32>
+ %res = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %sext)
+ ret i32 %res
+}
+
+define i32 @sext_v16i8(ptr %in) {
+; SIMD128-LABEL: sext_v16i8:
+; SIMD128: .functype sext_v16i8 (i32) -> (i32)
+; SIMD128-NEXT: # %bb.0:
+; SIMD128-NEXT: v128.load $push22=, 0($0):p2align=0
+; SIMD128-NEXT: local.tee $push21=, $1=, $pop22
+; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push10=, $pop21
+; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push11=, $pop10
+; SIMD128-NEXT: i8x16.shuffle $push7=, $1, $1, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push8=, $pop7
+; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push9=, $pop8
+; SIMD128-NEXT: i32x4.add $push12=, $pop11, $pop9
+; SIMD128-NEXT: i8x16.shuffle $push3=, $1, $1, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push4=, $pop3
+; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push5=, $pop4
+; SIMD128-NEXT: i8x16.shuffle $push0=, $1, $1, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push1=, $pop0
+; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push2=, $pop1
+; SIMD128-NEXT: i32x4.add $push6=, $pop5, $pop2
+; SIMD128-NEXT: i32x4.add $push20=, $pop12, $pop6
+; SIMD128-NEXT: local.tee $push19=, $1=, $pop20
+; SIMD128-NEXT: i8x16.shuffle $push13=, $1, $1, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push18=, $pop19, $pop13
+; SIMD128-NEXT: local.tee $push17=, $1=, $pop18
+; SIMD128-NEXT: i8x16.shuffle $push14=, $1, $1, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push15=, $pop17, $pop14
+; SIMD128-NEXT: i32x4.extract_lane $push16=, $pop15, 0
+; SIMD128-NEXT: return $pop16
+ %load = load <16 x i8>, ptr %in, align 1
+ %sext = sext <16 x i8> %load to <16 x i32>
+ %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %sext)
+ ret i32 %res
+}
+
+define i32 @sext_v32i8(ptr %in) {
+; SIMD128-LABEL: sext_v32i8:
+; SIMD128: .functype sext_v32i8 (i32) -> (i32)
+; SIMD128-NEXT: # %bb.0:
+; SIMD128-NEXT: v128.load $push39=, 0($0):p2align=0
+; SIMD128-NEXT: local.tee $push38=, $2=, $pop39
+; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push24=, $pop38
+; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push25=, $pop24
+; SIMD128-NEXT: v128.load $push37=, 16($0):p2align=0
+; SIMD128-NEXT: local.tee $push36=, $1=, $pop37
+; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push22=, $pop36
+; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push23=, $pop22
+; SIMD128-NEXT: i32x4.add $push26=, $pop25, $pop23
+; SIMD128-NEXT: i8x16.shuffle $push18=, $2, $2, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push19=, $pop18
+; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push20=, $pop19
+; SIMD128-NEXT: i8x16.shuffle $push15=, $1, $2, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push16=, $pop15
+; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push17=, $pop16
+; SIMD128-NEXT: i32x4.add $push21=, $pop20, $pop17
+; SIMD128-NEXT: i32x4.add $push27=, $pop26, $pop21
+; SIMD128-NEXT: i8x16.shuffle $push10=, $2, $2, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push11=, $pop10
+; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push12=, $pop11
+; SIMD128-NEXT: i8x16.shuffle $push7=, $1, $2, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push8=, $pop7
+; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push9=, $pop8
+; SIMD128-NEXT: i32x4.add $push13=, $pop12, $pop9
+; SIMD128-NEXT: i8x16.shuffle $push3=, $2, $2, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push4=, $pop3
+; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push5=, $pop4
+; SIMD128-NEXT: i8x16.shuffle $push0=, $1, $2, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_s $push1=, $pop0
+; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push2=, $pop1
+; SIMD128-NEXT: i32x4.add $push6=, $pop5, $pop2
+; SIMD128-NEXT: i32x4.add $push14=, $pop13, $pop6
+; SIMD128-NEXT: i32x4.add $push35=, $pop27, $pop14
+; SIMD128-NEXT: local.tee $push34=, $2=, $pop35
+; SIMD128-NEXT: i8x16.shuffle $push28=, $2, $2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push33=, $pop34, $pop28
+; SIMD128-NEXT: local.tee $push32=, $2=, $pop33
+; SIMD128-NEXT: i8x16.shuffle $push29=, $2, $2, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push30=, $pop32, $pop29
+; SIMD128-NEXT: i32x4.extract_lane $push31=, $pop30, 0
+; SIMD128-NEXT: return $pop31
+ %wide.load = load <32 x i8>, ptr %in, align 1
+ %sext = sext <32 x i8> %wide.load to <32 x i32>
+ %res = tail call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %sext)
+ ret i32 %res
+}
+
+define i32 @sext_v4i16(ptr %in) {
+; SIMD128-LABEL: sext_v4i16:
+; SIMD128: .functype sext_v4i16 (i32) -> (i32)
+; SIMD128-NEXT: # %bb.0:
+; SIMD128-NEXT: i32x4.load16x4_s $push7=, 0($0):p2align=1
+; SIMD128-NEXT: local.tee $push6=, $1=, $pop7
+; SIMD128-NEXT: i8x16.shuffle $push0=, $1, $1, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push5=, $pop6, $pop0
+; SIMD128-NEXT: local.tee $push4=, $1=, $pop5
+; SIMD128-NEXT: i8x16.shuffle $push1=, $1, $1, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push2=, $pop4, $pop1
+; SIMD128-NEXT: i32x4.extract_lane $push3=, $pop2, 0
+; SIMD128-NEXT: return $pop3
+ %narrow.load = load <4 x i16>, ptr %in, align 2
+ %sext = sext <4 x i16> %narrow.load to <4 x i32>
+ %res = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %sext)
+ ret i32 %res
+}
+
+define i32 @sext_v8i16(ptr %in) {
+; SIMD128-LABEL: sext_v8i16:
+; SIMD128: .functype sext_v8i16 (i32) -> (i32)
+; SIMD128-NEXT: # %bb.0:
+; SIMD128-NEXT: v128.load $push11=, 0($0):p2align=1
+; SIMD128-NEXT: local.tee $push10=, $1=, $pop11
+; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push1=, $pop10
+; SIMD128-NEXT: i32x4.extend_high_i16x8_s $push0=, $1
+; SIMD128-NEXT: i32x4.add $push9=, $pop1, $pop0
+; SIMD128-NEXT: local.tee $push8=, $1=, $pop9
+; SIMD128-NEXT: i8x16.shuffle $push2=, $1, $1, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push7=, $pop8, $pop2
+; SIMD128-NEXT: local.tee $push6=, $1=, $pop7
+; SIMD128-NEXT: i8x16.shuffle $push3=, $1, $1, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push4=, $pop6, $pop3
+; SIMD128-NEXT: i32x4.extract_lane $push5=, $pop4, 0
+; SIMD128-NEXT: return $pop5
+ %load = load <8 x i16>, ptr %in, align 2
+ %sext = sext <8 x i16> %load to <8 x i32>
+ %res = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %sext)
+ ret i32 %res
+}
+
+define i32 @sext_v16i16(ptr %in) {
+; SIMD128-LABEL: sext_v16i16:
+; SIMD128: .functype sext_v16i16 (i32) -> (i32)
+; SIMD128-NEXT: # %bb.0:
+; SIMD128-NEXT: v128.load $push17=, 0($0):p2align=1
+; SIMD128-NEXT: local.tee $push16=, $2=, $pop17
+; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push4=, $pop16
+; SIMD128-NEXT: v128.load $push15=, 16($0):p2align=1
+; SIMD128-NEXT: local.tee $push14=, $1=, $pop15
+; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push3=, $pop14
+; SIMD128-NEXT: i32x4.add $push5=, $pop4, $pop3
+; SIMD128-NEXT: i32x4.extend_high_i16x8_s $push1=, $2
+; SIMD128-NEXT: i32x4.extend_high_i16x8_s $push0=, $1
+; SIMD128-NEXT: i32x4.add $push2=, $pop1, $pop0
+; SIMD128-NEXT: i32x4.add $push13=, $pop5, $pop2
+; SIMD128-NEXT: local.tee $push12=, $2=, $pop13
+; SIMD128-NEXT: i8x16.shuffle $push6=, $2, $2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push11=, $pop12, $pop6
+; SIMD128-NEXT: local.tee $push10=, $2=, $pop11
+; SIMD128-NEXT: i8x16.shuffle $push7=, $2, $2, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push8=, $pop10, $pop7
+; SIMD128-NEXT: i32x4.extract_lane $push9=, $pop8, 0
+; SIMD128-NEXT: return $pop9
+ %wide.load = load <16 x i16>, ptr %in, align 2
+ %sext = sext <16 x i16> %wide.load to <16 x i32>
+ %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %sext)
+ ret i32 %res
+}
+
+define i64 @sext_v2i32(ptr %in) {
+; SIMD128-LABEL: sext_v2i32:
+; SIMD128: .functype sext_v2i32 (i32) -> (i64)
+; SIMD128-NEXT: # %bb.0:
+; SIMD128-NEXT: i64x2.load32x2_s $push4=, 0($0):p2align=2
+; SIMD128-NEXT: local.tee $push3=, $1=, $pop4
+; SIMD128-NEXT: i8x16.shuffle $push0=, $1, $1, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT: i64x2.add $push1=, $pop3, $pop0
+; SIMD128-NEXT: i64x2.extract_lane $push2=, $pop1, 0
+; SIMD128-NEXT: return $pop2
+ %narrow.load = load <2 x i32>, ptr %in, align 4
+ %sext = sext <2 x i32> %narrow.load to <2 x i64>
+ %res = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %sext)
+ ret i64 %res
+}
+
+define i64 @sext_v4i32(ptr %in) {
+; SIMD128-LABEL: sext_v4i32:
+; SIMD128: .functype sext_v4i32 (i32) -> (i64)
+; SIMD128-NEXT: # %bb.0:
+; SIMD128-NEXT: v128.load $push8=, 0($0):p2align=2
+; SIMD128-NEXT: local.tee $push7=, $1=, $pop8
+; SIMD128-NEXT: i64x2.extend_low_i32x4_s $push1=, $pop7
+; SIMD128-NEXT: i64x2.extend_high_i32x4_s $push0=, $1
+; SIMD128-NEXT: i64x2.add $push6=, $pop1, $pop0
+; SIMD128-NEXT: local.tee $push5=, $1=, $pop6
+; SIMD128-NEXT: i8x16.shuffle $push2=, $1, $1, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT: i64x2.add $push3=, $pop5, $pop2
+; SIMD128-NEXT: i64x2.extract_lane $push4=, $pop3, 0
+; SIMD128-NEXT: return $pop4
+ %load = load <4 x i32>, ptr %in, align 4
+ %sext = sext <4 x i32> %load to <4 x i64>
+ %res = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %sext)
+ ret i64 %res
+}
+
+define i32 @zext_v8i8(ptr %in) {
+; SIMD128-LABEL: zext_v8i8:
+; SIMD128: .functype zext_v8i8 (i32) -> (i32)
+; SIMD128-NEXT: # %bb.0:
+; SIMD128-NEXT: v128.load64_zero $push14=, 0($0):p2align=0
+; SIMD128-NEXT: local.tee $push13=, $1=, $pop14
+; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push3=, $pop13
+; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push4=, $pop3
+; SIMD128-NEXT: i8x16.shuffle $push0=, $1, $1, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push1=, $pop0
+; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push2=, $pop1
+; SIMD128-NEXT: i32x4.add $push12=, $pop4, $pop2
+; SIMD128-NEXT: local.tee $push11=, $1=, $pop12
+; SIMD128-NEXT: i8x16.shuffle $push5=, $1, $1, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push10=, $pop11, $pop5
+; SIMD128-NEXT: local.tee $push9=, $1=, $pop10
+; SIMD128-NEXT: i8x16.shuffle $push6=, $1, $1, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push7=, $pop9, $pop6
+; SIMD128-NEXT: i32x4.extract_lane $push8=, $pop7, 0
+; SIMD128-NEXT: return $pop8
+ %narrow.load = load <8 x i8>, ptr %in, align 1
+ %zext = zext <8 x i8> %narrow.load to <8 x i32>
+ %res = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %zext)
+ ret i32 %res
+}
+
+define i32 @zext_v16i8(ptr %in) {
+; SIMD128-LABEL: zext_v16i8:
+; SIMD128: .functype zext_v16i8 (i32) -> (i32)
+; SIMD128-NEXT: # %bb.0:
+; SIMD128-NEXT: v128.load $push22=, 0($0):p2align=0
+; SIMD128-NEXT: local.tee $push21=, $1=, $pop22
+; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push10=, $pop21
+; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push11=, $pop10
+; SIMD128-NEXT: i8x16.shuffle $push7=, $1, $1, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push8=, $pop7
+; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push9=, $pop8
+; SIMD128-NEXT: i32x4.add $push12=, $pop11, $pop9
+; SIMD128-NEXT: i8x16.shuffle $push3=, $1, $1, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push4=, $pop3
+; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push5=, $pop4
+; SIMD128-NEXT: i8x16.shuffle $push0=, $1, $1, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push1=, $pop0
+; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push2=, $pop1
+; SIMD128-NEXT: i32x4.add $push6=, $pop5, $pop2
+; SIMD128-NEXT: i32x4.add $push20=, $pop12, $pop6
+; SIMD128-NEXT: local.tee $push19=, $1=, $pop20
+; SIMD128-NEXT: i8x16.shuffle $push13=, $1, $1, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push18=, $pop19, $pop13
+; SIMD128-NEXT: local.tee $push17=, $1=, $pop18
+; SIMD128-NEXT: i8x16.shuffle $push14=, $1, $1, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push15=, $pop17, $pop14
+; SIMD128-NEXT: i32x4.extract_lane $push16=, $pop15, 0
+; SIMD128-NEXT: return $pop16
+ %load = load <16 x i8>, ptr %in, align 1
+ %zext = zext <16 x i8> %load to <16 x i32>
+ %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %zext)
+ ret i32 %res
+}
+
+define i32 @zext_v32i8(ptr %in) {
+; SIMD128-LABEL: zext_v32i8:
+; SIMD128: .functype zext_v32i8 (i32) -> (i32)
+; SIMD128-NEXT: # %bb.0:
+; SIMD128-NEXT: v128.load $push39=, 0($0):p2align=0
+; SIMD128-NEXT: local.tee $push38=, $2=, $pop39
+; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push24=, $pop38
+; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push25=, $pop24
+; SIMD128-NEXT: v128.load $push37=, 16($0):p2align=0
+; SIMD128-NEXT: local.tee $push36=, $1=, $pop37
+; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push22=, $pop36
+; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push23=, $pop22
+; SIMD128-NEXT: i32x4.add $push26=, $pop25, $pop23
+; SIMD128-NEXT: i8x16.shuffle $push18=, $2, $2, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push19=, $pop18
+; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push20=, $pop19
+; SIMD128-NEXT: i8x16.shuffle $push15=, $1, $2, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push16=, $pop15
+; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push17=, $pop16
+; SIMD128-NEXT: i32x4.add $push21=, $pop20, $pop17
+; SIMD128-NEXT: i32x4.add $push27=, $pop26, $pop21
+; SIMD128-NEXT: i8x16.shuffle $push10=, $2, $2, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push11=, $pop10
+; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push12=, $pop11
+; SIMD128-NEXT: i8x16.shuffle $push7=, $1, $2, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push8=, $pop7
+; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push9=, $pop8
+; SIMD128-NEXT: i32x4.add $push13=, $pop12, $pop9
+; SIMD128-NEXT: i8x16.shuffle $push3=, $2, $2, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push4=, $pop3
+; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push5=, $pop4
+; SIMD128-NEXT: i8x16.shuffle $push0=, $1, $2, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extend_low_i8x16_u $push1=, $pop0
+; SIMD128-NEXT: i32x4.extend_low_i16x8_u $push2=, $pop1
+; SIMD128-NEXT: i32x4.add $push6=, $pop5, $pop2
+; SIMD128-NEXT: i32x4.add $push14=, $pop13, $pop6
+; SIMD128-NEXT: i32x4.add $push35=, $pop27, $pop14
+; SIMD128-NEXT: local.tee $push34=, $2=, $pop35
+; SIMD128-NEXT: i8x16.shuffle $push28=, $2, $2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push33=, $pop34, $pop28
+; SIMD128-NEXT: local.tee $push32=, $2=, $pop33
+; SIMD128-NEXT: i8x16.shuffle $push29=, $2, $2, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+; SIMD128-NEXT: i32x4.add $push30=, $pop32, $pop29
+; SIMD128-NEXT: i32x4.extract_lane $push31=, $pop30, 0
+; SIMD128-NEXT: return $pop31
+ %wide.load = load <32 x i8>, ptr %in, align 1
+ %zext = zext <32 x i8> %wide.load to <32 x i32>
+ %res = tail call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %zext)
+ ret i32 %res
+}
+
+define i32 @zext_v4i16(ptr %in) {
+; SIMD128-LABEL: zext_v4i16:
+; SIMD128: .functype zext_v4i16 (i32) -> (i32)
+; SIMD128-NEXT: # %bb.0:
+; SIMD128-NEXT: i32x4.load16x4_u $push7=, 0($0):p2align=1
+; SIMD128-NEXT: lo...
[truncated]
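Past the truncation, the remaining checks follow the same shapes as above. In the test file's own check-line style, the intended codegen change for the i16-to-i32 case can be sketched as follows (illustrative lines, not verbatim llc output):

; before this patch: move the high half down with a shuffle, then extend the low half
; SIMD128: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push1=, $pop0
; after this patch: a single extend of the high half
; SIMD128: i32x4.extend_high_i16x8_s $push1=, $0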
✅ With the latest revision this PR passed the undef deprecator.
(force-pushed from 0e768f8 to 035e501)
// Look for a shuffle which moves from the high half to the low half.
size_t FirstIdx = Mask.size() / 2;
for (size_t i = 0; i < Mask.size() / 2; ++i) {
Do we have to check that the top half of the mask has expected values as well?
We don't, but as the user is an extend_low, the high half of the result doesn't matter. But I will add a check that the extend is the only user.
I implemented and then removed the user check as I couldn't see the benefit.
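A sketch of why that is safe (hypothetical DAG nodes): by the time the helper runs, the nodes look roughly like

t1: v8i16 = vector_shuffle<4,5,6,7,u,u,u,u> t0, undef
t2: v4i32 = sign_extend_vector_inreg t1

and t2 reads only lanes 0-3 of t1, so whatever the undef lanes 4-7 hold can never reach the extended result.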
; SIMD128-NEXT: return $pop8
%narrow.load = load <8 x i8>, ptr %in, align 1
%sext = sext <8 x i8> %narrow.load to <8 x i32>
%res = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %sext)
Would it be possible to write tests that check the pattern more directly as well?
Fair enough, I'll give it a go.
The tests now just use a shuffle and an extend.
(force-pushed from 035e501 to 791d134)
(force-pushed from 791d134 to 33f4784)
Looks great, thanks!
; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
; SIMD128-NEXT: i32x4.extend_low_i16x8_s $push1=, $pop0
; SIMD128-NEXT: return $pop1
%shuffle = shufflevector <4 x i16> %in, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
%res = sext <2 x i16> %shuffle to <2 x i32>
Why won't extend_high get picked here? (and in a couple more spots in this test)
Because the pattern matching requires shuffling the MSB eight bytes into the LSB eight bytes.
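For contrast, a case the matcher is meant to accept (a hedged sketch: the input occupies a full 128-bit vector and the mask moves the entire high half down):

%shuffle = shufflevector <8 x i16> %in, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%res = sext <4 x i16> %shuffle to <4 x i32>
; expected to select i32x4.extend_high_i16x8_s instead of a shuffle + extend_low pair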