Skip to content

Commit 4a8c1f7

Browse files
authored
[WebAssembly] [Backend] Wasm optimize illegal bitmask (#145627)
[WebAssembly] [Backend] Wasm optimize illegal bitmask for #131980. Currently, for an illegal bitmask (v32i8 or v64i8), two (or four) v128 vectors are concatenated together at the SelectionDAG level and then compared by a single SETCC on the illegal wide type, which requires expansion later on. Instead, I opt for SETCC-ing each v128 separately, bitcasting and zero-extending the results, and then adding them together at the end. --------- Co-authored-by: badumbatish <--show-origin>
1 parent ae21048 commit 4a8c1f7

File tree

3 files changed

+655
-228
lines changed

3 files changed

+655
-228
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

Lines changed: 54 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "llvm/CodeGen/MachineJumpTableInfo.h"
2525
#include "llvm/CodeGen/MachineModuleInfo.h"
2626
#include "llvm/CodeGen/MachineRegisterInfo.h"
27+
#include "llvm/CodeGen/SDPatternMatch.h"
2728
#include "llvm/CodeGen/SelectionDAG.h"
2829
#include "llvm/CodeGen/SelectionDAGNodes.h"
2930
#include "llvm/IR/DiagnosticInfo.h"
@@ -3214,20 +3215,23 @@ static SDValue performTruncateCombine(SDNode *N,
32143215

32153216
static SDValue performBitcastCombine(SDNode *N,
32163217
TargetLowering::DAGCombinerInfo &DCI) {
3218+
using namespace llvm::SDPatternMatch;
32173219
auto &DAG = DCI.DAG;
32183220
SDLoc DL(N);
32193221
SDValue Src = N->getOperand(0);
32203222
EVT VT = N->getValueType(0);
32213223
EVT SrcVT = Src.getValueType();
32223224

3223-
// bitcast <N x i1> to iN
3225+
if (!(DCI.isBeforeLegalize() && VT.isScalarInteger() &&
3226+
SrcVT.isFixedLengthVector() && SrcVT.getScalarType() == MVT::i1))
3227+
return SDValue();
3228+
3229+
unsigned NumElts = SrcVT.getVectorNumElements();
3230+
EVT Width = MVT::getIntegerVT(128 / NumElts);
3231+
3232+
// bitcast <N x i1> to iN, where N = 2, 4, 8, 16 (legal)
32243233
// ==> bitmask
3225-
if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
3226-
SrcVT.isFixedLengthVector() && SrcVT.getScalarType() == MVT::i1) {
3227-
unsigned NumElts = SrcVT.getVectorNumElements();
3228-
if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
3229-
return SDValue();
3230-
EVT Width = MVT::getIntegerVT(128 / NumElts);
3234+
if (NumElts == 2 || NumElts == 4 || NumElts == 8 || NumElts == 16) {
32313235
return DAG.getZExtOrTrunc(
32323236
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
32333237
{DAG.getConstant(Intrinsic::wasm_bitmask, DL, MVT::i32),
@@ -3236,6 +3240,49 @@ static SDValue performBitcastCombine(SDNode *N,
32363240
DL, VT);
32373241
}
32383242

3243+
// bitcast <N x i1>(setcc ...) to concat iN, where N = 32 and 64 (illegal)
3244+
if (NumElts == 32 || NumElts == 64) {
3245+
// Strategy: We will setcc them separately in v16i8 -> v16i1
3246+
// Bitcast them to i16, extend them to either i32 or i64.
3247+
// Combine them into one integer, shifting by 16 bits and adding one chunk at a time.
3248+
SDValue Concat, SetCCVector;
3249+
ISD::CondCode SetCond;
3250+
3251+
if (!sd_match(N, m_BitCast(m_c_SetCC(m_Value(Concat), m_Value(SetCCVector),
3252+
m_CondCode(SetCond)))))
3253+
return SDValue();
3254+
if (Concat.getOpcode() != ISD::CONCAT_VECTORS)
3255+
return SDValue();
3256+
3257+
uint64_t ElementWidth =
3258+
SetCCVector.getValueType().getVectorElementType().getFixedSizeInBits();
3259+
3260+
SmallVector<SDValue> VectorsToShuffle;
3261+
for (size_t I = 0; I < Concat->ops().size(); I++) {
3262+
VectorsToShuffle.push_back(DAG.getBitcast(
3263+
MVT::i16,
3264+
DAG.getSetCC(DL, MVT::v16i1, Concat->ops()[I],
3265+
extractSubVector(SetCCVector, I * (128 / ElementWidth),
3266+
DAG, DL, 128),
3267+
SetCond)));
3268+
}
3269+
3270+
MVT ReturnType = VectorsToShuffle.size() == 2 ? MVT::i32 : MVT::i64;
3271+
SDValue ReturningInteger = DAG.getConstant(0, DL, ReturnType);
3272+
3273+
for (SDValue V : VectorsToShuffle) {
3274+
ReturningInteger = DAG.getNode(
3275+
ISD::SHL, DL, ReturnType,
3276+
{DAG.getShiftAmountConstant(16, ReturnType, DL), ReturningInteger});
3277+
3278+
SDValue ExtendedV = DAG.getZExtOrTrunc(V, DL, ReturnType);
3279+
ReturningInteger =
3280+
DAG.getNode(ISD::ADD, DL, ReturnType, {ReturningInteger, ExtendedV});
3281+
}
3282+
3283+
return ReturningInteger;
3284+
}
3285+
32393286
return SDValue();
32403287
}
32413288

llvm/test/CodeGen/WebAssembly/simd-bitmask.ll

Lines changed: 5 additions & 221 deletions
Original file line numberDiff line numberDiff line change
@@ -177,236 +177,20 @@ define i32 @bitmask_v32i8(<32 x i8> %v) {
177177
; CHECK: .functype bitmask_v32i8 (v128, v128) -> (i32)
178178
; CHECK-NEXT: .local v128
179179
; CHECK-NEXT: # %bb.0:
180-
; CHECK-NEXT: global.get __stack_pointer
181180
; CHECK-NEXT: i32.const 16
182-
; CHECK-NEXT: i32.sub
183-
; CHECK-NEXT: drop
184181
; CHECK-NEXT: local.get 0
185182
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
186183
; CHECK-NEXT: local.tee 2
187184
; CHECK-NEXT: i8x16.eq
188-
; CHECK-NEXT: local.tee 0
189-
; CHECK-NEXT: i8x16.extract_lane_u 0
190-
; CHECK-NEXT: i32.const 1
191-
; CHECK-NEXT: i32.and
192-
; CHECK-NEXT: local.get 0
193-
; CHECK-NEXT: i8x16.extract_lane_u 1
194-
; CHECK-NEXT: i32.const 1
195-
; CHECK-NEXT: i32.and
196-
; CHECK-NEXT: i32.const 1
197-
; CHECK-NEXT: i32.shl
198-
; CHECK-NEXT: i32.or
199-
; CHECK-NEXT: local.get 0
200-
; CHECK-NEXT: i8x16.extract_lane_u 2
201-
; CHECK-NEXT: i32.const 1
202-
; CHECK-NEXT: i32.and
203-
; CHECK-NEXT: i32.const 2
204-
; CHECK-NEXT: i32.shl
205-
; CHECK-NEXT: i32.or
206-
; CHECK-NEXT: local.get 0
207-
; CHECK-NEXT: i8x16.extract_lane_u 3
208-
; CHECK-NEXT: i32.const 1
209-
; CHECK-NEXT: i32.and
210-
; CHECK-NEXT: i32.const 3
211-
; CHECK-NEXT: i32.shl
212-
; CHECK-NEXT: i32.or
213-
; CHECK-NEXT: local.get 0
214-
; CHECK-NEXT: i8x16.extract_lane_u 4
215-
; CHECK-NEXT: i32.const 1
216-
; CHECK-NEXT: i32.and
217-
; CHECK-NEXT: i32.const 4
218-
; CHECK-NEXT: i32.shl
219-
; CHECK-NEXT: i32.or
220-
; CHECK-NEXT: local.get 0
221-
; CHECK-NEXT: i8x16.extract_lane_u 5
222-
; CHECK-NEXT: i32.const 1
223-
; CHECK-NEXT: i32.and
224-
; CHECK-NEXT: i32.const 5
225-
; CHECK-NEXT: i32.shl
226-
; CHECK-NEXT: i32.or
227-
; CHECK-NEXT: local.get 0
228-
; CHECK-NEXT: i8x16.extract_lane_u 6
229-
; CHECK-NEXT: i32.const 1
230-
; CHECK-NEXT: i32.and
231-
; CHECK-NEXT: i32.const 6
232-
; CHECK-NEXT: i32.shl
233-
; CHECK-NEXT: i32.or
234-
; CHECK-NEXT: local.get 0
235-
; CHECK-NEXT: i8x16.extract_lane_u 7
236-
; CHECK-NEXT: i32.const 1
237-
; CHECK-NEXT: i32.and
238-
; CHECK-NEXT: i32.const 7
239-
; CHECK-NEXT: i32.shl
240-
; CHECK-NEXT: i32.or
241-
; CHECK-NEXT: local.get 0
242-
; CHECK-NEXT: i8x16.extract_lane_u 8
243-
; CHECK-NEXT: i32.const 1
244-
; CHECK-NEXT: i32.and
245-
; CHECK-NEXT: i32.const 8
246-
; CHECK-NEXT: i32.shl
247-
; CHECK-NEXT: i32.or
248-
; CHECK-NEXT: local.get 0
249-
; CHECK-NEXT: i8x16.extract_lane_u 9
250-
; CHECK-NEXT: i32.const 1
251-
; CHECK-NEXT: i32.and
252-
; CHECK-NEXT: i32.const 9
253-
; CHECK-NEXT: i32.shl
254-
; CHECK-NEXT: i32.or
255-
; CHECK-NEXT: local.get 0
256-
; CHECK-NEXT: i8x16.extract_lane_u 10
257-
; CHECK-NEXT: i32.const 1
258-
; CHECK-NEXT: i32.and
259-
; CHECK-NEXT: i32.const 10
260-
; CHECK-NEXT: i32.shl
261-
; CHECK-NEXT: i32.or
262-
; CHECK-NEXT: local.get 0
263-
; CHECK-NEXT: i8x16.extract_lane_u 11
264-
; CHECK-NEXT: i32.const 1
265-
; CHECK-NEXT: i32.and
266-
; CHECK-NEXT: i32.const 11
267-
; CHECK-NEXT: i32.shl
268-
; CHECK-NEXT: i32.or
269-
; CHECK-NEXT: local.get 0
270-
; CHECK-NEXT: i8x16.extract_lane_u 12
271-
; CHECK-NEXT: i32.const 1
272-
; CHECK-NEXT: i32.and
273-
; CHECK-NEXT: i32.const 12
274-
; CHECK-NEXT: i32.shl
275-
; CHECK-NEXT: i32.or
276-
; CHECK-NEXT: local.get 0
277-
; CHECK-NEXT: i8x16.extract_lane_u 13
278-
; CHECK-NEXT: i32.const 1
279-
; CHECK-NEXT: i32.and
280-
; CHECK-NEXT: i32.const 13
281-
; CHECK-NEXT: i32.shl
282-
; CHECK-NEXT: i32.or
283-
; CHECK-NEXT: local.get 0
284-
; CHECK-NEXT: i8x16.extract_lane_u 14
285-
; CHECK-NEXT: i32.const 1
286-
; CHECK-NEXT: i32.and
287-
; CHECK-NEXT: i32.const 14
288-
; CHECK-NEXT: i32.shl
289-
; CHECK-NEXT: i32.or
290-
; CHECK-NEXT: local.get 0
291-
; CHECK-NEXT: i8x16.extract_lane_u 15
292-
; CHECK-NEXT: i32.const 15
185+
; CHECK-NEXT: i8x16.bitmask
186+
; CHECK-NEXT: i32.const 16
187+
; CHECK-NEXT: i32.add
293188
; CHECK-NEXT: i32.shl
294-
; CHECK-NEXT: i32.or
295-
; CHECK-NEXT: i32.const 65535
296-
; CHECK-NEXT: i32.and
297189
; CHECK-NEXT: local.get 1
298190
; CHECK-NEXT: local.get 2
299191
; CHECK-NEXT: i8x16.eq
300-
; CHECK-NEXT: local.tee 0
301-
; CHECK-NEXT: i8x16.extract_lane_u 15
302-
; CHECK-NEXT: i32.const 31
303-
; CHECK-NEXT: i32.shl
304-
; CHECK-NEXT: local.get 0
305-
; CHECK-NEXT: i8x16.extract_lane_u 14
306-
; CHECK-NEXT: i32.const 1
307-
; CHECK-NEXT: i32.and
308-
; CHECK-NEXT: i32.const 30
309-
; CHECK-NEXT: i32.shl
310-
; CHECK-NEXT: local.get 0
311-
; CHECK-NEXT: i8x16.extract_lane_u 13
312-
; CHECK-NEXT: i32.const 1
313-
; CHECK-NEXT: i32.and
314-
; CHECK-NEXT: i32.const 29
315-
; CHECK-NEXT: i32.shl
316-
; CHECK-NEXT: local.get 0
317-
; CHECK-NEXT: i8x16.extract_lane_u 12
318-
; CHECK-NEXT: i32.const 1
319-
; CHECK-NEXT: i32.and
320-
; CHECK-NEXT: i32.const 28
321-
; CHECK-NEXT: i32.shl
322-
; CHECK-NEXT: local.get 0
323-
; CHECK-NEXT: i8x16.extract_lane_u 11
324-
; CHECK-NEXT: i32.const 1
325-
; CHECK-NEXT: i32.and
326-
; CHECK-NEXT: i32.const 27
327-
; CHECK-NEXT: i32.shl
328-
; CHECK-NEXT: local.get 0
329-
; CHECK-NEXT: i8x16.extract_lane_u 10
330-
; CHECK-NEXT: i32.const 1
331-
; CHECK-NEXT: i32.and
332-
; CHECK-NEXT: i32.const 26
333-
; CHECK-NEXT: i32.shl
334-
; CHECK-NEXT: local.get 0
335-
; CHECK-NEXT: i8x16.extract_lane_u 9
336-
; CHECK-NEXT: i32.const 1
337-
; CHECK-NEXT: i32.and
338-
; CHECK-NEXT: i32.const 25
339-
; CHECK-NEXT: i32.shl
340-
; CHECK-NEXT: local.get 0
341-
; CHECK-NEXT: i8x16.extract_lane_u 8
342-
; CHECK-NEXT: i32.const 1
343-
; CHECK-NEXT: i32.and
344-
; CHECK-NEXT: i32.const 24
345-
; CHECK-NEXT: i32.shl
346-
; CHECK-NEXT: local.get 0
347-
; CHECK-NEXT: i8x16.extract_lane_u 7
348-
; CHECK-NEXT: i32.const 1
349-
; CHECK-NEXT: i32.and
350-
; CHECK-NEXT: i32.const 23
351-
; CHECK-NEXT: i32.shl
352-
; CHECK-NEXT: local.get 0
353-
; CHECK-NEXT: i8x16.extract_lane_u 6
354-
; CHECK-NEXT: i32.const 1
355-
; CHECK-NEXT: i32.and
356-
; CHECK-NEXT: i32.const 22
357-
; CHECK-NEXT: i32.shl
358-
; CHECK-NEXT: local.get 0
359-
; CHECK-NEXT: i8x16.extract_lane_u 5
360-
; CHECK-NEXT: i32.const 1
361-
; CHECK-NEXT: i32.and
362-
; CHECK-NEXT: i32.const 21
363-
; CHECK-NEXT: i32.shl
364-
; CHECK-NEXT: local.get 0
365-
; CHECK-NEXT: i8x16.extract_lane_u 4
366-
; CHECK-NEXT: i32.const 1
367-
; CHECK-NEXT: i32.and
368-
; CHECK-NEXT: i32.const 20
369-
; CHECK-NEXT: i32.shl
370-
; CHECK-NEXT: local.get 0
371-
; CHECK-NEXT: i8x16.extract_lane_u 3
372-
; CHECK-NEXT: i32.const 1
373-
; CHECK-NEXT: i32.and
374-
; CHECK-NEXT: i32.const 19
375-
; CHECK-NEXT: i32.shl
376-
; CHECK-NEXT: local.get 0
377-
; CHECK-NEXT: i8x16.extract_lane_u 2
378-
; CHECK-NEXT: i32.const 1
379-
; CHECK-NEXT: i32.and
380-
; CHECK-NEXT: i32.const 18
381-
; CHECK-NEXT: i32.shl
382-
; CHECK-NEXT: local.get 0
383-
; CHECK-NEXT: i8x16.extract_lane_u 1
384-
; CHECK-NEXT: i32.const 1
385-
; CHECK-NEXT: i32.and
386-
; CHECK-NEXT: i32.const 17
387-
; CHECK-NEXT: i32.shl
388-
; CHECK-NEXT: local.get 0
389-
; CHECK-NEXT: i8x16.extract_lane_u 0
390-
; CHECK-NEXT: i32.const 1
391-
; CHECK-NEXT: i32.and
392-
; CHECK-NEXT: i32.const 16
393-
; CHECK-NEXT: i32.shl
394-
; CHECK-NEXT: i32.or
395-
; CHECK-NEXT: i32.or
396-
; CHECK-NEXT: i32.or
397-
; CHECK-NEXT: i32.or
398-
; CHECK-NEXT: i32.or
399-
; CHECK-NEXT: i32.or
400-
; CHECK-NEXT: i32.or
401-
; CHECK-NEXT: i32.or
402-
; CHECK-NEXT: i32.or
403-
; CHECK-NEXT: i32.or
404-
; CHECK-NEXT: i32.or
405-
; CHECK-NEXT: i32.or
406-
; CHECK-NEXT: i32.or
407-
; CHECK-NEXT: i32.or
408-
; CHECK-NEXT: i32.or
409-
; CHECK-NEXT: i32.or
192+
; CHECK-NEXT: i8x16.bitmask
193+
; CHECK-NEXT: i32.add
410194
; CHECK-NEXT: # fallthrough-return
411195
%cmp = icmp eq <32 x i8> %v, zeroinitializer
412196
%bitmask = bitcast <32 x i1> %cmp to i32

0 commit comments

Comments
 (0)