[WebAssembly] load_zero to initialise build_vector #100610
@llvm/pr-subscribers-backend-webassembly

Author: Sam Parker (sparker-arm)

Changes

Instead of splatting a single lane to initialise a build_vector, lower to scalar_to_vector, which can be selected to load_zero. Also add the load_zero and load_lane patterns for f32x4 and f64x2.

Full diff: https://github.com/llvm/llvm-project/pull/100610.diff

4 Files Affected:
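As a rough illustration of the intended codegen change (a hand-written WebAssembly sketch, not actual compiler output), for a v128 built from four loaded i32 scalars only the first instruction in the sequence changes, from a splat to a zeroing load:

local.get 0
v128.load32_splat        ;; before: lanes become [a, a, a, a]

local.get 0
v128.load32_zero         ;; after: lanes become [a, 0, 0, 0]

;; In both cases lanes 1..3 are then filled with v128.load32_lane,
;; so whatever ended up in those lanes is overwritten anyway.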
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 7f1a5f616ed48..cdc9d9e2e3ec0 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -238,7 +238,7 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
WASM_LOAD_STORE(MEMORY_ATOMIC_NOTIFY)
WASM_LOAD_STORE(MEMORY_ATOMIC_WAIT32)
WASM_LOAD_STORE(LOAD32_SPLAT)
- WASM_LOAD_STORE(LOAD_ZERO_I32x4)
+ WASM_LOAD_STORE(LOAD_ZERO_32)
WASM_LOAD_STORE(LOAD_LANE_I32x4)
WASM_LOAD_STORE(STORE_LANE_I32x4)
return 2;
@@ -263,7 +263,7 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
WASM_LOAD_STORE(LOAD_EXTEND_U_I32x4)
WASM_LOAD_STORE(LOAD_EXTEND_S_I64x2)
WASM_LOAD_STORE(LOAD_EXTEND_U_I64x2)
- WASM_LOAD_STORE(LOAD_ZERO_I64x2)
+ WASM_LOAD_STORE(LOAD_ZERO_64)
WASM_LOAD_STORE(LOAD_LANE_I64x2)
WASM_LOAD_STORE(STORE_LANE_I64x2)
return 3;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index f77076d7244ca..960ef90148095 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -2275,8 +2275,15 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
return IsConstant(Lane);
};
} else {
- // Use a splat (which might be selected as a load splat)
- Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
+ size_t DestLaneSize = VecT.getVectorElementType().getFixedSizeInBits();
+ if (NumSplatLanes == 1 && (DestLaneSize == 32 || DestLaneSize == 64)) {
+ // Could be selected to load_zero.
+ assert(SplatValue == Op->getOperand(0));
+ Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecT, SplatValue);
+ } else {
+ // Use a splat (which might be selected as a load splat)
+ Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
+ }
IsLaneConstructed = [&SplatValue](size_t _, const SDValue &Lane) {
return Lane == SplatValue;
};
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 26fe61b1d6051..76fde44a3f63c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -273,13 +273,13 @@ defm : LoadPat<vec.vt, loadpat, inst>;
multiclass SIMDLoadZero<Vec vec, bits<32> simdop> {
defvar name = "v128.load"#vec.lane_bits#"_zero";
let mayLoad = 1, UseNamedOperandTable = 1 in {
- defm LOAD_ZERO_#vec#_A32 :
+ defm LOAD_ZERO_#vec.lane_bits#_A32 :
SIMD_I<(outs V128:$dst),
(ins P2Align:$p2align, offset32_op:$off, I32:$addr),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
name#"\t$dst, ${off}(${addr})$p2align",
name#"\t$off$p2align", simdop>;
- defm LOAD_ZERO_#vec#_A64 :
+ defm LOAD_ZERO_#vec.lane_bits#_A64 :
SIMD_I<(outs V128:$dst),
(ins P2Align:$p2align, offset64_op:$off, I64:$addr),
(outs), (ins P2Align:$p2align, offset64_op:$off), [],
@@ -293,15 +293,15 @@ defm "" : SIMDLoadZero<I64x2, 0x5d>;
// Use load_zero to load scalars into vectors as well where possible.
// TODO: i16, and i8 scalars
-foreach vec = [I32x4, I64x2] in {
- defvar inst = "LOAD_ZERO_"#vec;
+foreach vec = [I32x4, I64x2, F32x4, F64x2] in {
+ defvar inst = "LOAD_ZERO_"#vec.lane_bits;
defvar pat = PatFrag<(ops node:$addr), (scalar_to_vector (vec.lane_vt (load $addr)))>;
defm : LoadPat<vec.vt, pat, inst>;
}
// TODO: f32x4 and f64x2 as well
foreach vec = [I32x4, I64x2] in {
- defvar inst = "LOAD_ZERO_"#vec;
+ defvar inst = "LOAD_ZERO_"#vec.lane_bits;
defvar pat = PatFrag<(ops node:$ptr),
(vector_insert (vec.splat (vec.lane_vt 0)), (vec.lane_vt (load $ptr)), 0)>;
defm : LoadPat<vec.vt, pat, inst>;
@@ -1463,10 +1463,10 @@ def extloadv2f32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
// Adapted from the body of LoadPatNoOffset
// TODO: other addressing patterns
def : Pat<(v2f64 (extloadv2f32 (i32 I32:$addr))),
- (promote_low_F64x2 (LOAD_ZERO_I64x2_A32 0, 0, I32:$addr))>,
+ (promote_low_F64x2 (LOAD_ZERO_64_A32 0, 0, I32:$addr))>,
Requires<[HasAddr32]>;
def : Pat<(v2f64 (extloadv2f32 (i64 I64:$addr))),
- (promote_low_F64x2 (LOAD_ZERO_I64x2_A64 0, 0, I64:$addr))>,
+ (promote_low_F64x2 (LOAD_ZERO_64_A64 0, 0, I64:$addr))>,
Requires<[HasAddr64]>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
index a51b358de2e89..7075e21ccf0c8 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
@@ -440,3 +440,76 @@ define <2 x double> @all_undef_f64x2() {
; CHECK-NEXT: return $0
ret <2 x double> undef
}
+
+define <4 x i32> @load_zero_lane_i32x4(ptr %addr.a, ptr %addr.b, ptr %addr.c, ptr %addr.d) {
+; CHECK-LABEL: load_zero_lane_i32x4:
+; CHECK: .functype load_zero_lane_i32x4 (i32, i32, i32, i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.load32_zero $push0=, 0($0)
+; CHECK-NEXT: v128.load32_lane $push1=, 0($1), $pop0, 1
+; CHECK-NEXT: v128.load32_lane $push2=, 0($2), $pop1, 2
+; CHECK-NEXT: v128.load32_lane $push3=, 0($3), $pop2, 3
+; CHECK-NEXT: return $pop3
+ %a = load i32, ptr %addr.a
+ %b = load i32, ptr %addr.b
+ %c = load i32, ptr %addr.c
+ %d = load i32, ptr %addr.d
+ %v = insertelement <4 x i32> undef, i32 %a, i32 0
+ %v.1 = insertelement <4 x i32> %v, i32 %b, i32 1
+ %v.2 = insertelement <4 x i32> %v.1, i32 %c, i32 2
+ %v.3 = insertelement <4 x i32> %v.2, i32 %d, i32 3
+ ret <4 x i32> %v.3
+}
+
+define <2 x i64> @load_zero_lane_i64x2(ptr %addr.a, ptr %addr.b) {
+; CHECK-LABEL: load_zero_lane_i64x2:
+; CHECK: .functype load_zero_lane_i64x2 (i32, i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.load64_zero $push0=, 0($0)
+; CHECK-NEXT: v128.load64_lane $push1=, 0($1), $pop0, 1
+; CHECK-NEXT: return $pop1
+ %a = load i64, ptr %addr.a
+ %b = load i64, ptr %addr.b
+ %v = insertelement <2 x i64> undef, i64 %a, i32 0
+ %v.1 = insertelement <2 x i64> %v, i64 %b, i32 1
+ ret <2 x i64> %v.1
+}
+
+define <4 x float> @load_zero_lane_f32x4(ptr %addr.a, ptr %addr.b, ptr %addr.c, ptr %addr.d) {
+; CHECK-LABEL: load_zero_lane_f32x4:
+; CHECK: .functype load_zero_lane_f32x4 (i32, i32, i32, i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.load32_zero $push1=, 0($0)
+; CHECK-NEXT: f32.load $push0=, 0($1)
+; CHECK-NEXT: f32x4.replace_lane $push2=, $pop1, 1, $pop0
+; CHECK-NEXT: f32.load $push3=, 0($2)
+; CHECK-NEXT: f32x4.replace_lane $push4=, $pop2, 2, $pop3
+; CHECK-NEXT: f32.load $push5=, 0($3)
+; CHECK-NEXT: f32x4.replace_lane $push6=, $pop4, 3, $pop5
+; CHECK-NEXT: return $pop6
+ %a = load float, ptr %addr.a
+ %b = load float, ptr %addr.b
+ %c = load float, ptr %addr.c
+ %d = load float, ptr %addr.d
+ %v = insertelement <4 x float> undef, float %a, i32 0
+ %v.1 = insertelement <4 x float> %v, float %b, i32 1
+ %v.2 = insertelement <4 x float> %v.1, float %c, i32 2
+ %v.3 = insertelement <4 x float> %v.2, float %d, i32 3
+ ret <4 x float> %v.3
+}
+
+define <2 x double> @load_zero_lane_f64x2(ptr %addr.a, ptr %addr.b) {
+; CHECK-LABEL: load_zero_lane_f64x2:
+; CHECK: .functype load_zero_lane_f64x2 (i32, i32) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.load64_zero $push1=, 0($0)
+; CHECK-NEXT: f64.load $push0=, 0($1)
+; CHECK-NEXT: f64x2.replace_lane $push2=, $pop1, 1, $pop0
+; CHECK-NEXT: return $pop2
+ %a = load double, ptr %addr.a
+ %b = load double, ptr %addr.b
+ %v = insertelement <2 x double> undef, double %a, i32 0
+ %v.1 = insertelement <2 x double> %v, double %b, i32 1
+ ret <2 x double> %v.1
+}
+
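The renamed LOAD_ZERO_64 instructions also feed the existing extending-load pattern in the .td changes above. As a sketch of the code that pattern produces (hand-written from the pattern, not compiler output), an extending load of <2 x float> to <2 x double> becomes:

local.get 0
v128.load64_zero            ;; both f32 values land in the low 64 bits
f64x2.promote_low_f32x4     ;; widen the two low f32 lanes to f64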
Looks good! I guess the load_zero is generally more efficient than the load_splat?
Yes, it should be more easily codegen'd to a scalar load of some sort. I noticed on my Arm box that the splats were quite costly in numerical loops, and this change gave some nice uplifts on V8.
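For intuition on the cost difference, here is an illustrative sketch of how each load might map to AArch64 machine code (assumed typical lowerings, not measured V8 output):

v128.load32_splat    ;; load plus broadcast, e.g. ld1r { v0.4s }, [x0]
v128.load32_zero     ;; plain scalar load, e.g. ldr s0, [x0];
                     ;; the remaining lanes are zeroed for free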
This is causing the LLVM roller in Emscripten to fail due to the failing tests test_avx, test_sse4_1, and test_sse2: see https://ci.chromium.org/ui/p/emscripten-releases/builders/try/linux/b8740537667603659377/overview. test_avx fails by hitting an assertion.
Okay, it's hitting the assertion I added. I'll take a look today.
Proposed fix: #101961