Skip to content

Commit 542523a

Browse files
committed
[WebAssembly] Emulate v128.const efficiently
v128.const was recently implemented in V8, but until it rolls into Chrome stable, we can't enable it in the WebAssembly backend without breaking origin trial users. So far, build_vectors that would otherwise have been lowered to v128.const have instead been lowered to splats followed by sequences of replace_lane instructions that initialize each lane individually. That produces large and inefficient code, so this patch introduces new logic to lower integer vector constants to a single i64x2.splat where possible, followed by at most one i64x2.replace_lane if necessary. Adapted from a patch authored by @omnisip. Differential Revision: https://reviews.llvm.org/D88591
1 parent b0ce9f0 commit 542523a

File tree

2 files changed

+130
-8
lines changed

2 files changed

+130
-8
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

Lines changed: 65 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "llvm/IR/Intrinsics.h"
3131
#include "llvm/IR/IntrinsicsWebAssembly.h"
3232
#include "llvm/Support/Debug.h"
33+
#include "llvm/Support/Endian.h"
3334
#include "llvm/Support/ErrorHandling.h"
3435
#include "llvm/Support/raw_ostream.h"
3536
#include "llvm/Target/TargetOptions.h"
@@ -1565,6 +1566,7 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
15651566
};
15661567
} else if (NumConstantLanes >= NumSplatLanes &&
15671568
Subtarget->hasUnimplementedSIMD128()) {
1569+
// If we support v128.const, emit it directly
15681570
SmallVector<SDValue, 16> ConstLanes;
15691571
for (const SDValue &Lane : Op->op_values()) {
15701572
if (IsConstant(Lane)) {
@@ -1576,11 +1578,67 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
15761578
}
15771579
}
15781580
Result = DAG.getBuildVector(VecT, DL, ConstLanes);
1579-
IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
1581+
IsLaneConstructed = [&IsConstant](size_t _, const SDValue &Lane) {
15801582
return IsConstant(Lane);
15811583
};
1582-
}
1583-
if (!Result) {
1584+
} else if (NumConstantLanes >= NumSplatLanes && VecT.isInteger()) {
1585+
// Otherwise, if this is an integer vector, pack the lane values together so
1586+
// we can construct the 128-bit constant from a pair of i64s using a splat
1587+
// followed by at most one i64x2.replace_lane. Also keep track of the lanes
1588+
// that actually matter so we can avoid the replace_lane in more cases.
1589+
std::array<uint64_t, 2> I64s({0, 0});
1590+
std::array<uint64_t, 2> ConstLaneMasks({0, 0});
1591+
uint8_t *I64Bytes = reinterpret_cast<uint8_t *>(I64s.data());
1592+
uint8_t *MaskBytes = reinterpret_cast<uint8_t *>(ConstLaneMasks.data());
1593+
unsigned I = 0;
1594+
size_t ByteStep = VecT.getScalarSizeInBits() / 8;
1595+
for (const SDValue &Lane : Op->op_values()) {
1596+
if (IsConstant(Lane)) {
1597+
using llvm::support::little;
1598+
using llvm::support::endian::byte_swap;
1599+
// The endianness of the compiler matters here. We want to enforce
1600+
// little endianness so that the bytes of a smaller integer type will
1601+
// occur first in the uint64_t.
1602+
auto *Const = cast<ConstantSDNode>(Lane.getNode());
1603+
uint64_t Val = byte_swap(Const->getLimitedValue(), little);
1604+
uint8_t *ValPtr = reinterpret_cast<uint8_t *>(&Val);
1605+
std::copy(ValPtr, ValPtr + ByteStep, I64Bytes + I * ByteStep);
1606+
uint64_t Mask = uint64_t(-1LL);
1607+
uint8_t *MaskPtr = reinterpret_cast<uint8_t *>(&Mask);
1608+
std::copy(MaskPtr, MaskPtr + ByteStep, MaskBytes + I * ByteStep);
1609+
}
1610+
++I;
1611+
}
1612+
// Check whether all constant lanes in the second half of the vector are
1613+
// equivalent in the first half or vice versa to determine whether splatting
1614+
// either side will be sufficient to materialize the constant. As a special
1615+
// case, if the first and second halves have no constant lanes in common, we
1616+
// can just combine them.
1617+
bool FirstHalfSufficient = (I64s[0] & ConstLaneMasks[1]) == I64s[1];
1618+
bool SecondHalfSufficient = (I64s[1] & ConstLaneMasks[0]) == I64s[0];
1619+
bool CombinedSufficient = (ConstLaneMasks[0] & ConstLaneMasks[1]) == 0;
1620+
1621+
uint64_t Splatted;
1622+
if (SecondHalfSufficient) {
1623+
Splatted = I64s[1];
1624+
} else if (CombinedSufficient) {
1625+
Splatted = I64s[0] | I64s[1];
1626+
} else {
1627+
Splatted = I64s[0];
1628+
}
1629+
1630+
Result = DAG.getSplatBuildVector(MVT::v2i64, DL,
1631+
DAG.getConstant(Splatted, DL, MVT::i64));
1632+
if (!FirstHalfSufficient && !SecondHalfSufficient && !CombinedSufficient) {
1633+
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2i64, Result,
1634+
DAG.getConstant(I64s[1], DL, MVT::i64),
1635+
DAG.getConstant(1, DL, MVT::i32));
1636+
}
1637+
Result = DAG.getBitcast(VecT, Result);
1638+
IsLaneConstructed = [&IsConstant](size_t _, const SDValue &Lane) {
1639+
return IsConstant(Lane);
1640+
};
1641+
} else {
15841642
// Use a splat, but possibly a load_splat
15851643
LoadSDNode *SplattedLoad;
15861644
if ((SplattedLoad = dyn_cast<LoadSDNode>(SplatValue)) &&
@@ -1593,11 +1651,14 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
15931651
} else {
15941652
Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
15951653
}
1596-
IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
1654+
IsLaneConstructed = [&SplatValue](size_t _, const SDValue &Lane) {
15971655
return Lane == SplatValue;
15981656
};
15991657
}
16001658

1659+
assert(Result);
1660+
assert(IsLaneConstructed);
1661+
16011662
// Add replace_lane instructions for any unhandled values
16021663
for (size_t I = 0; I < Lanes; ++I) {
16031664
const SDValue &Lane = Op->getOperand(I);

llvm/test/CodeGen/WebAssembly/simd-build-vector.ll

Lines changed: 65 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,73 @@
88
target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
99
target triple = "wasm32-unknown-unknown"
1010

11+
; CHECK-LABEL: emulated_const_trivial_splat:
12+
; CHECK-NEXT: .functype emulated_const_trivial_splat () -> (v128)
13+
; SIMD-VM-NEXT: i64.const $push0=, 8589934593
14+
; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0
15+
; SIMD-VM-NEXT: return $pop1
16+
; UNIMP: v128.const
17+
define <4 x i32> @emulated_const_trivial_splat() {
18+
ret <4 x i32> <i32 1, i32 2, i32 1, i32 2>
19+
}
20+
21+
; CHECK-LABEL: emulated_const_first_sufficient:
22+
; CHECK-NEXT: .functype emulated_const_first_sufficient () -> (v128)
23+
; SIMD-VM-NEXT: i64.const $push0=, 8589934593
24+
; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0
25+
; SIMD-VM-NEXT: return $pop1
26+
; UNIMP: v128.const
27+
define <4 x i32> @emulated_const_first_sufficient() {
28+
ret <4 x i32> <i32 1, i32 2, i32 undef, i32 2>
29+
}
30+
31+
; CHECK-LABEL: emulated_const_second_sufficient:
32+
; CHECK-NEXT: .functype emulated_const_second_sufficient () -> (v128)
33+
; SIMD-VM-NEXT: i64.const $push0=, 8589934593
34+
; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0
35+
; SIMD-VM-NEXT: return $pop1
36+
; UNIMP: v128.const
37+
define <4 x i32> @emulated_const_second_sufficient() {
38+
ret <4 x i32> <i32 1, i32 undef, i32 1, i32 2>
39+
}
40+
41+
; CHECK-LABEL: emulated_const_combined_sufficient:
42+
; CHECK-NEXT: .functype emulated_const_combined_sufficient () -> (v128)
43+
; SIMD-VM-NEXT: i64.const $push0=, 8589934593
44+
; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0
45+
; SIMD-VM-NEXT: return $pop1
46+
; UNIMP: v128.const
47+
define <4 x i32> @emulated_const_combined_sufficient() {
48+
ret <4 x i32> <i32 1, i32 undef, i32 undef, i32 2>
49+
}
50+
51+
; CHECK-LABEL: emulated_const_either_sufficient:
52+
; CHECK-NEXT: .functype emulated_const_either_sufficient () -> (v128)
53+
; SIMD-VM-NEXT: i64.const $push0=, 1
54+
; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0
55+
; SIMD-VM-NEXT: return $pop1
56+
; UNIMP: v128.const
57+
define <4 x i32> @emulated_const_either_sufficient() {
58+
ret <4 x i32> <i32 1, i32 undef, i32 1, i32 undef>
59+
}
60+
61+
; CHECK-LABEL: emulated_const_neither_sufficient:
62+
; CHECK-NEXT: .functype emulated_const_neither_sufficient () -> (v128)
63+
; SIMD-VM-NEXT: i64.const $push0=, 8589934593
64+
; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0
65+
; SIMD-VM-NEXT: i64.const $push2=, 17179869184
66+
; SIMD-VM-NEXT: i64x2.replace_lane $push3=, $pop1, 1, $pop2
67+
; SIMD-VM-NEXT: return $pop3
68+
define <4 x i32> @emulated_const_neither_sufficient() {
69+
ret <4 x i32> <i32 1, i32 2, i32 undef, i32 4>
70+
}
71+
1172
; CHECK-LABEL: same_const_one_replaced_i16x8:
1273
; CHECK-NEXT: .functype same_const_one_replaced_i16x8 (i32) -> (v128)
1374
; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 42, 42, 42, 42, 42, 0, 42, 42
1475
; UNIMP-NEXT: i16x8.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 5, $0
1576
; UNIMP-NEXT: return $pop[[L1]]
16-
; SIMD-VM: i16x8.splat
77+
; SIMD-VM: i64x2.splat
1778
define <8 x i16> @same_const_one_replaced_i16x8(i16 %x) {
1879
%v = insertelement
1980
<8 x i16> <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>,
@@ -27,7 +88,7 @@ define <8 x i16> @same_const_one_replaced_i16x8(i16 %x) {
2788
; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 1, -2, 3, -4, 5, 0, 7, -8
2889
; UNIMP-NEXT: i16x8.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 5, $0
2990
; UNIMP-NEXT: return $pop[[L1]]
30-
; SIMD-VM: i16x8.splat
91+
; SIMD-VM: i64x2.splat
3192
define <8 x i16> @different_const_one_replaced_i16x8(i16 %x) {
3293
%v = insertelement
3394
<8 x i16> <i16 1, i16 -2, i16 3, i16 -4, i16 5, i16 -6, i16 7, i16 -8>,
@@ -68,7 +129,7 @@ define <4 x float> @different_const_one_replaced_f32x4(float %x) {
68129
; CHECK-NEXT: .functype splat_common_const_i32x4 () -> (v128)
69130
; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 0, 3, 3, 1
70131
; UNIMP-NEXT: return $pop[[L0]]
71-
; SIMD-VM: i32x4.splat
132+
; SIMD-VM: i64x2.splat
72133
define <4 x i32> @splat_common_const_i32x4() {
73134
ret <4 x i32> <i32 undef, i32 3, i32 3, i32 1>
74135
}
@@ -206,7 +267,7 @@ define <16 x i8> @mashup_swizzle_i8x16(<16 x i8> %src, <16 x i8> %mask, i8 %spla
206267
; UNIMP: i8x16.replace_lane
207268
; UNIMP: i8x16.replace_lane
208269
; UNIMP: return
209-
; SIMD-VM: i8x16.splat
270+
; SIMD-VM: i64x2.splat
210271
define <16 x i8> @mashup_const_i8x16(<16 x i8> %src, <16 x i8> %mask, i8 %splatted) {
211272
; swizzle 0
212273
%m0 = extractelement <16 x i8> %mask, i32 0

0 commit comments

Comments
 (0)