[WebAssembly] load_zero to initialise build_vector #100610

Merged: 1 commit, Aug 2, 2024
@@ -177,7 +177,7 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
WASM_LOAD_STORE(ATOMIC_RMW8_U_CMPXCHG_I32)
WASM_LOAD_STORE(ATOMIC_RMW8_U_CMPXCHG_I64)
WASM_LOAD_STORE(LOAD8_SPLAT)
WASM_LOAD_STORE(LOAD_LANE_I8x16)
WASM_LOAD_STORE(LOAD_LANE_8)
WASM_LOAD_STORE(STORE_LANE_I8x16)
return 0;
WASM_LOAD_STORE(LOAD16_S_I32)
@@ -205,7 +205,7 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
WASM_LOAD_STORE(ATOMIC_RMW16_U_CMPXCHG_I32)
WASM_LOAD_STORE(ATOMIC_RMW16_U_CMPXCHG_I64)
WASM_LOAD_STORE(LOAD16_SPLAT)
WASM_LOAD_STORE(LOAD_LANE_I16x8)
WASM_LOAD_STORE(LOAD_LANE_16)
WASM_LOAD_STORE(STORE_LANE_I16x8)
WASM_LOAD_STORE(LOAD_F16_F32)
WASM_LOAD_STORE(STORE_F16_F32)
@@ -238,8 +238,8 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
WASM_LOAD_STORE(MEMORY_ATOMIC_NOTIFY)
WASM_LOAD_STORE(MEMORY_ATOMIC_WAIT32)
WASM_LOAD_STORE(LOAD32_SPLAT)
WASM_LOAD_STORE(LOAD_ZERO_I32x4)
WASM_LOAD_STORE(LOAD_LANE_I32x4)
WASM_LOAD_STORE(LOAD_ZERO_32)
WASM_LOAD_STORE(LOAD_LANE_32)
WASM_LOAD_STORE(STORE_LANE_I32x4)
return 2;
WASM_LOAD_STORE(LOAD_I64)
@@ -263,8 +263,8 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
WASM_LOAD_STORE(LOAD_EXTEND_U_I32x4)
WASM_LOAD_STORE(LOAD_EXTEND_S_I64x2)
WASM_LOAD_STORE(LOAD_EXTEND_U_I64x2)
WASM_LOAD_STORE(LOAD_ZERO_I64x2)
WASM_LOAD_STORE(LOAD_LANE_I64x2)
WASM_LOAD_STORE(LOAD_ZERO_64)
WASM_LOAD_STORE(LOAD_LANE_64)
WASM_LOAD_STORE(STORE_LANE_I64x2)
return 3;
WASM_LOAD_STORE(LOAD_V128)
11 changes: 9 additions & 2 deletions llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -2275,8 +2275,15 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
return IsConstant(Lane);
};
} else {
// Use a splat (which might be selected as a load splat)
Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
size_t DestLaneSize = VecT.getVectorElementType().getFixedSizeInBits();
if (NumSplatLanes == 1 && (DestLaneSize == 32 || DestLaneSize == 64)) {
// Could be selected to load_zero.
assert(SplatValue == Op->getOperand(0));
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecT, SplatValue);
} else {
// Use a splat (which might be selected as a load splat)
Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
}
IsLaneConstructed = [&SplatValue](size_t _, const SDValue &Lane) {
return Lane == SplatValue;
};
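For context (not part of the PR text): the new branch fires when exactly one lane of the BUILD_VECTOR carries the chosen splat value and the destination lanes are 32 or 64 bits wide. The scalar is then wrapped in ISD::SCALAR_TO_VECTOR, which the patterns in WebAssemblyInstrSIMD.td below can select to v128.load32_zero / v128.load64_zero when the scalar is loaded from memory, instead of first splatting it across every lane. A minimal LLVM IR sketch of the shape this targets (not taken from the PR's tests; the quoted instruction sequences are approximate):

; Sketch only. Lane 0 comes from a load, lane 1 from a plain i32 value,
; lanes 2 and 3 stay undef, so exactly one lane holds the chosen splat value.
define <4 x i32> @first_lane_from_load(ptr %p, i32 %b) {
  %a = load i32, ptr %p
  %v0 = insertelement <4 x i32> undef, i32 %a, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %b, i32 1
  ret <4 x i32> %v1
}
; Roughly: lane 0 previously went through v128.load32_splat (splatting %a to
; every lane before lane 1 is overwritten); with this change it can select
; v128.load32_zero instead, while lane 1 is still filled with i32x4.replace_lane.
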
49 changes: 27 additions & 22 deletions llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -273,13 +273,13 @@ defm : LoadPat<vec.vt, loadpat, inst>;
multiclass SIMDLoadZero<Vec vec, bits<32> simdop> {
defvar name = "v128.load"#vec.lane_bits#"_zero";
let mayLoad = 1, UseNamedOperandTable = 1 in {
defm LOAD_ZERO_#vec#_A32 :
defm LOAD_ZERO_#vec.lane_bits#_A32 :
SIMD_I<(outs V128:$dst),
(ins P2Align:$p2align, offset32_op:$off, I32:$addr),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
name#"\t$dst, ${off}(${addr})$p2align",
name#"\t$off$p2align", simdop>;
defm LOAD_ZERO_#vec#_A64 :
defm LOAD_ZERO_#vec.lane_bits#_A64 :
SIMD_I<(outs V128:$dst),
(ins P2Align:$p2align, offset64_op:$off, I64:$addr),
(outs), (ins P2Align:$p2align, offset64_op:$off), [],
@@ -293,32 +293,32 @@ defm "" : SIMDLoadZero<I64x2, 0x5d>;

// Use load_zero to load scalars into vectors as well where possible.
// TODO: i16, and i8 scalars
foreach vec = [I32x4, I64x2] in {
defvar inst = "LOAD_ZERO_"#vec;
foreach vec = [I32x4, I64x2, F32x4, F64x2] in {
defvar inst = "LOAD_ZERO_"#vec.lane_bits;
defvar pat = PatFrag<(ops node:$addr), (scalar_to_vector (vec.lane_vt (load $addr)))>;
defm : LoadPat<vec.vt, pat, inst>;
}

// TODO: f32x4 and f64x2 as well
foreach vec = [I32x4, I64x2] in {
defvar inst = "LOAD_ZERO_"#vec;
defvar inst = "LOAD_ZERO_"#vec.lane_bits;
defvar pat = PatFrag<(ops node:$ptr),
(vector_insert (vec.splat (vec.lane_vt 0)), (vec.lane_vt (load $ptr)), 0)>;
defm : LoadPat<vec.vt, pat, inst>;
}

// Load lane
multiclass SIMDLoadLane<Vec vec, bits<32> simdop> {
defvar name = "v128.load"#vec.lane_bits#"_lane";
multiclass SIMDLoadLane<bits<32> lane_bits, bits<32> simdop> {
defvar name = "v128.load"#lane_bits#"_lane";
let mayLoad = 1, UseNamedOperandTable = 1 in {
defm LOAD_LANE_#vec#_A32 :
defm LOAD_LANE_#lane_bits#_A32 :
SIMD_I<(outs V128:$dst),
(ins P2Align:$p2align, offset32_op:$off, vec_i8imm_op:$idx,
I32:$addr, V128:$vec),
(outs), (ins P2Align:$p2align, offset32_op:$off, vec_i8imm_op:$idx),
[], name#"\t$dst, ${off}(${addr})$p2align, $vec, $idx",
name#"\t$off$p2align, $idx", simdop>;
defm LOAD_LANE_#vec#_A64 :
defm LOAD_LANE_#lane_bits#_A64 :
SIMD_I<(outs V128:$dst),
(ins P2Align:$p2align, offset64_op:$off, vec_i8imm_op:$idx,
I64:$addr, V128:$vec),
@@ -328,15 +328,15 @@ multiclass SIMDLoadLane<Vec vec, bits<32> simdop> {
} // mayLoad = 1, UseNamedOperandTable = 1
}

defm "" : SIMDLoadLane<I8x16, 0x54>;
defm "" : SIMDLoadLane<I16x8, 0x55>;
defm "" : SIMDLoadLane<I32x4, 0x56>;
defm "" : SIMDLoadLane<I64x2, 0x57>;
defm "" : SIMDLoadLane<8, 0x54>;
defm "" : SIMDLoadLane<16, 0x55>;
defm "" : SIMDLoadLane<32, 0x56>;
defm "" : SIMDLoadLane<64, 0x57>;

// Select loads with no constant offset.
multiclass LoadLanePatNoOffset<Vec vec, SDPatternOperator kind> {
defvar load_lane_a32 = !cast<NI>("LOAD_LANE_"#vec#"_A32");
defvar load_lane_a64 = !cast<NI>("LOAD_LANE_"#vec#"_A64");
defvar load_lane_a32 = !cast<NI>("LOAD_LANE_"#vec.lane_bits#"_A32");
defvar load_lane_a64 = !cast<NI>("LOAD_LANE_"#vec.lane_bits#"_A64");
def : Pat<(vec.vt (kind (i32 I32:$addr),
(vec.vt V128:$vec), (i32 vec.lane_idx:$idx))),
(load_lane_a32 0, 0, imm:$idx, $addr, $vec)>,
@@ -354,17 +354,22 @@ def load16_lane :
PatFrag<(ops node:$ptr, node:$vec, node:$idx),
(vector_insert $vec, (i32 (extloadi16 $ptr)), $idx)>;
def load32_lane :
PatFrag<(ops node:$ptr, node:$vec, node:$idx),
(vector_insert $vec, (i32 (load $ptr)), $idx)>;
PatFrags<(ops node:$ptr, node:$vec, node:$idx), [
(vector_insert $vec, (i32 (load $ptr)), $idx),
(vector_insert $vec, (f32 (load $ptr)), $idx)
]>;
def load64_lane :
PatFrag<(ops node:$ptr, node:$vec, node:$idx),
(vector_insert $vec, (i64 (load $ptr)), $idx)>;
// TODO: floating point lanes as well
PatFrags<(ops node:$ptr, node:$vec, node:$idx), [
(vector_insert $vec, (i64 (load $ptr)), $idx),
(vector_insert $vec, (f64 (load $ptr)), $idx)
]>;

defm : LoadLanePatNoOffset<I8x16, load8_lane>;
defm : LoadLanePatNoOffset<I16x8, load16_lane>;
defm : LoadLanePatNoOffset<I32x4, load32_lane>;
defm : LoadLanePatNoOffset<I64x2, load64_lane>;
defm : LoadLanePatNoOffset<F32x4, load32_lane>;
defm : LoadLanePatNoOffset<F64x2, load64_lane>;

// TODO: Also support the other load patterns for load_lane once the instructions
// are merged to the proposal.
@@ -1463,10 +1468,10 @@ def extloadv2f32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
// Adapted from the body of LoadPatNoOffset
// TODO: other addressing patterns
def : Pat<(v2f64 (extloadv2f32 (i32 I32:$addr))),
(promote_low_F64x2 (LOAD_ZERO_I64x2_A32 0, 0, I32:$addr))>,
(promote_low_F64x2 (LOAD_ZERO_64_A32 0, 0, I32:$addr))>,
Requires<[HasAddr32]>;
def : Pat<(v2f64 (extloadv2f32 (i64 I64:$addr))),
(promote_low_F64x2 (LOAD_ZERO_I64x2_A64 0, 0, I64:$addr))>,
(promote_low_F64x2 (LOAD_ZERO_64_A64 0, 0, I64:$addr))>,
Requires<[HasAddr64]>;

//===----------------------------------------------------------------------===//
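Also for illustration (not among the PR's test updates): extending the scalar_to_vector foreach to F32x4 and F64x2, and turning load32_lane / load64_lane into PatFrags that also match f32/f64 inserts, means a lone float or double loaded into lane 0 of an otherwise-undef vector can take the _zero load directly, and further float lanes can use v128.load32_lane / v128.load64_lane, as the f32x4/f64x2 tests added below exercise. A minimal sketch, with the expected selection noted as a comment:

; Sketch only: a single double loaded into lane 0 of an otherwise-undef
; <2 x double>.
define <2 x double> @scalar_double_to_vector(ptr %p) {
  %x = load double, ptr %p
  %v = insertelement <2 x double> undef, double %x, i32 0
  ret <2 x double> %v
}
; Expected selection (roughly): v128.load64_zero 0($0), rather than
; materialising the scalar through a splat first.
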
69 changes: 69 additions & 0 deletions llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
@@ -440,3 +440,72 @@ define <2 x double> @all_undef_f64x2() {
; CHECK-NEXT: return $0
ret <2 x double> undef
}

define <4 x i32> @load_zero_lane_i32x4(ptr %addr.a, ptr %addr.b, ptr %addr.c, ptr %addr.d) {
; CHECK-LABEL: load_zero_lane_i32x4:
; CHECK: .functype load_zero_lane_i32x4 (i32, i32, i32, i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.load32_zero $push0=, 0($0)
; CHECK-NEXT: v128.load32_lane $push1=, 0($1), $pop0, 1
; CHECK-NEXT: v128.load32_lane $push2=, 0($2), $pop1, 2
; CHECK-NEXT: v128.load32_lane $push3=, 0($3), $pop2, 3
; CHECK-NEXT: return $pop3
%a = load i32, ptr %addr.a
%b = load i32, ptr %addr.b
%c = load i32, ptr %addr.c
%d = load i32, ptr %addr.d
%v = insertelement <4 x i32> undef, i32 %a, i32 0
%v.1 = insertelement <4 x i32> %v, i32 %b, i32 1
%v.2 = insertelement <4 x i32> %v.1, i32 %c, i32 2
%v.3 = insertelement <4 x i32> %v.2, i32 %d, i32 3
ret <4 x i32> %v.3
}

define <2 x i64> @load_zero_lane_i64x2(ptr %addr.a, ptr %addr.b) {
; CHECK-LABEL: load_zero_lane_i64x2:
; CHECK: .functype load_zero_lane_i64x2 (i32, i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.load64_zero $push0=, 0($0)
; CHECK-NEXT: v128.load64_lane $push1=, 0($1), $pop0, 1
; CHECK-NEXT: return $pop1
%a = load i64, ptr %addr.a
%b = load i64, ptr %addr.b
%v = insertelement <2 x i64> undef, i64 %a, i32 0
%v.1 = insertelement <2 x i64> %v, i64 %b, i32 1
ret <2 x i64> %v.1
}

define <4 x float> @load_zero_lane_f32x4(ptr %addr.a, ptr %addr.b, ptr %addr.c, ptr %addr.d) {
; CHECK-LABEL: load_zero_lane_f32x4:
; CHECK: .functype load_zero_lane_f32x4 (i32, i32, i32, i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.load32_zero $push0=, 0($0)
; CHECK-NEXT: v128.load32_lane $push1=, 0($1), $pop0, 1
; CHECK-NEXT: v128.load32_lane $push2=, 0($2), $pop1, 2
; CHECK-NEXT: v128.load32_lane $push3=, 0($3), $pop2, 3
; CHECK-NEXT: return $pop3
%a = load float, ptr %addr.a
%b = load float, ptr %addr.b
%c = load float, ptr %addr.c
%d = load float, ptr %addr.d
%v = insertelement <4 x float> undef, float %a, i32 0
%v.1 = insertelement <4 x float> %v, float %b, i32 1
%v.2 = insertelement <4 x float> %v.1, float %c, i32 2
%v.3 = insertelement <4 x float> %v.2, float %d, i32 3
ret <4 x float> %v.3
}

define <2 x double> @load_zero_lane_f64x2(ptr %addr.a, ptr %addr.b) {
; CHECK-LABEL: load_zero_lane_f64x2:
; CHECK: .functype load_zero_lane_f64x2 (i32, i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.load64_zero $push0=, 0($0)
; CHECK-NEXT: v128.load64_lane $push1=, 0($1), $pop0, 1
; CHECK-NEXT: return $pop1
%a = load double, ptr %addr.a
%b = load double, ptr %addr.b
%v = insertelement <2 x double> undef, double %a, i32 0
%v.1 = insertelement <2 x double> %v, double %b, i32 1
ret <2 x double> %v.1
}
