Skip to content

Commit 953838d

Browse files
authored
[LoongArch] Optimize vector bitreverse using scalar bitrev and vshuf4i (#118054)
Custom-lower vector bitreverse to scalar bitrev and vshuf4i instructions. Keep `v2i64` and `v4i64` bitreverse as `Expand`, since that expansion is already good enough.
1 parent 41ed16c commit 953838d

File tree

5 files changed

+115
-102
lines changed

5 files changed

+115
-102
lines changed

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
270270
{ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT,
271271
Expand);
272272
}
273+
for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
274+
setOperationAction(ISD::BITREVERSE, VT, Custom);
273275
for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64})
274276
setOperationAction(ISD::BSWAP, VT, Legal);
275277
for (MVT VT : {MVT::v4i32, MVT::v2i64}) {
@@ -324,6 +326,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
324326
{ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT,
325327
Expand);
326328
}
329+
for (MVT VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32})
330+
setOperationAction(ISD::BITREVERSE, VT, Custom);
327331
for (MVT VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64})
328332
setOperationAction(ISD::BSWAP, VT, Legal);
329333
for (MVT VT : {MVT::v8i32, MVT::v4i32, MVT::v4i64}) {
@@ -440,10 +444,56 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
440444
return lowerBUILD_VECTOR(Op, DAG);
441445
case ISD::VECTOR_SHUFFLE:
442446
return lowerVECTOR_SHUFFLE(Op, DAG);
447+
case ISD::BITREVERSE:
448+
return lowerBITREVERSE(Op, DAG);
443449
}
444450
return SDValue();
445451
}
446452

453+
/// Custom-lower a vector ISD::BITREVERSE through scalar 64-bit bit reversal.
///
/// The source vector is reinterpreted as v2i64 (128-bit result types) or
/// v4i64 (all other result types). Every i64 lane is extracted, bit-reversed
/// as a scalar, and the lanes are reassembled and bitcast back to the
/// original result type:
///
///  - v16i8 / v32i8: each lane goes through LoongArchISD::BITREV_8B and the
///    rebuilt vector is returned unchanged.
///  - v8i16 / v16i16 / v4i32 / v8i32: each lane goes through a full 64-bit
///    ISD::BITREVERSE. That reverses the bits of every element but also
///    reverses the order of the elements inside the 64-bit lane, so a final
///    vector shuffle puts the elements of each lane back in order.
///
/// \param Op  The BITREVERSE node to lower; operand 0 is the source vector.
/// \param DAG The SelectionDAG used to create the replacement nodes.
/// \returns The lowered value, or an empty SDValue for any other result type.
SDValue LoongArchTargetLowering::lowerBITREVERSE(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT ResTy = Op->getValueType(0);
  SDValue Src = Op->getOperand(0);
  SDLoc DL(Op);

  EVT NewVT = ResTy.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
  unsigned OrigEltNum = ResTy.getVectorNumElements();
  unsigned NewEltNum = NewVT.getVectorNumElements();

  // The two opcodes belong to different enumerations (LoongArchISD::NodeType
  // and ISD::NodeType). Convert both to unsigned explicitly so the
  // conditional expression does not mix enum types, and pick the opcode once
  // because it does not depend on the lane being processed.
  const bool IsByteVector = ResTy == MVT::v16i8 || ResTy == MVT::v32i8;
  const unsigned RevOpc = IsByteVector
                              ? static_cast<unsigned>(LoongArchISD::BITREV_8B)
                              : static_cast<unsigned>(ISD::BITREVERSE);

  SDValue NewSrc = DAG.getNode(ISD::BITCAST, DL, NewVT, Src);

  // Reverse each 64-bit lane as a scalar.
  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < NewEltNum; ++i) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, NewSrc,
                              DAG.getConstant(i, DL, MVT::i64));
    Ops.push_back(DAG.getNode(RevOpc, DL, MVT::i64, Elt));
  }
  SDValue Res =
      DAG.getNode(ISD::BITCAST, DL, ResTy, DAG.getBuildVector(NewVT, DL, Ops));

  switch (ResTy.getSimpleVT().SimpleTy) {
  default:
    return SDValue();
  case MVT::v16i8:
  case MVT::v32i8:
    return Res;
  case MVT::v8i16:
  case MVT::v16i16:
  case MVT::v4i32:
  case MVT::v8i32: {
    // Undo the element-order reversal inside every 64-bit lane: for lane L
    // holding N elements, emit indices L*N + (N-1), ..., L*N + 0.
    const unsigned EltsPerLane = OrigEltNum / NewEltNum;
    SmallVector<int, 32> Mask;
    for (unsigned Lane = 0; Lane < NewEltNum; ++Lane)
      for (unsigned Idx = EltsPerLane; Idx-- > 0;)
        Mask.push_back(static_cast<int>(Idx + EltsPerLane * Lane));
    return DAG.getVectorShuffle(ResTy, DL, Res, DAG.getUNDEF(ResTy), Mask);
  }
  }
}
496+
447497
/// Determine whether a range fits a regular pattern of values.
448498
/// This function accounts for the possibility of jumping over the End iterator.
449499
template <typename ValType>
@@ -4685,6 +4735,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
46854735
NODE_NAME_CASE(REVB_2H)
46864736
NODE_NAME_CASE(REVB_2W)
46874737
NODE_NAME_CASE(BITREV_4B)
4738+
NODE_NAME_CASE(BITREV_8B)
46884739
NODE_NAME_CASE(BITREV_W)
46894740
NODE_NAME_CASE(ROTR_W)
46904741
NODE_NAME_CASE(ROTL_W)

llvm/lib/Target/LoongArch/LoongArchISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ enum NodeType : unsigned {
6868
REVB_2H,
6969
REVB_2W,
7070
BITREV_4B,
71+
BITREV_8B,
7172
BITREV_W,
7273

7374
// Intrinsic operations start ============================================
@@ -334,6 +335,7 @@ class LoongArchTargetLowering : public TargetLowering {
334335
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
335336
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
336337
SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
338+
SDValue lowerBITREVERSE(SDValue Op, SelectionDAG &DAG) const;
337339

338340
bool isFPImmLegal(const APFloat &Imm, EVT VT,
339341
bool ForCodeSize) const override;

llvm/lib/Target/LoongArch/LoongArchInstrInfo.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ def loongarch_bstrpick
112112
def loongarch_revb_2h : SDNode<"LoongArchISD::REVB_2H", SDTUnaryOp>;
113113
def loongarch_revb_2w : SDNode<"LoongArchISD::REVB_2W", SDTUnaryOp>;
114114
def loongarch_bitrev_4b : SDNode<"LoongArchISD::BITREV_4B", SDTUnaryOp>;
115+
def loongarch_bitrev_8b : SDNode<"LoongArchISD::BITREV_8B", SDTUnaryOp>;
115116
def loongarch_bitrev_w : SDNode<"LoongArchISD::BITREV_W", SDTUnaryOp>;
116117
def loongarch_clzw : SDNode<"LoongArchISD::CLZ_W", SDTIntBitCountUnaryOp>;
117118
def loongarch_ctzw : SDNode<"LoongArchISD::CTZ_W", SDTIntBitCountUnaryOp>;
@@ -1765,6 +1766,7 @@ def : Pat<(bitreverse (bswap GPR:$rj)), (BITREV_4B GPR:$rj)>;
17651766
let Predicates = [IsLA64] in {
17661767
def : Pat<(loongarch_revb_2w GPR:$rj), (REVB_2W GPR:$rj)>;
17671768
def : Pat<(bswap GPR:$rj), (REVB_D GPR:$rj)>;
1769+
def : Pat<(loongarch_bitrev_8b GPR:$rj), (BITREV_8B GPR:$rj)>;
17681770
def : Pat<(loongarch_bitrev_w GPR:$rj), (BITREV_W GPR:$rj)>;
17691771
def : Pat<(bitreverse GPR:$rj), (BITREV_D GPR:$rj)>;
17701772
def : Pat<(bswap (bitreverse GPR:$rj)), (BITREV_8B GPR:$rj)>;

llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll

Lines changed: 39 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,19 @@ declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>)
77
define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
88
; CHECK-LABEL: test_bitreverse_v32i8:
99
; CHECK: # %bb.0:
10-
; CHECK-NEXT: xvslli.b $xr1, $xr0, 4
11-
; CHECK-NEXT: xvsrli.b $xr0, $xr0, 4
12-
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
13-
; CHECK-NEXT: xvandi.b $xr1, $xr0, 51
14-
; CHECK-NEXT: xvslli.b $xr1, $xr1, 2
15-
; CHECK-NEXT: xvsrli.b $xr0, $xr0, 2
16-
; CHECK-NEXT: xvandi.b $xr0, $xr0, 51
17-
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
18-
; CHECK-NEXT: xvandi.b $xr1, $xr0, 85
19-
; CHECK-NEXT: xvslli.b $xr1, $xr1, 1
20-
; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1
21-
; CHECK-NEXT: xvandi.b $xr0, $xr0, 85
22-
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
10+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
11+
; CHECK-NEXT: bitrev.8b $a0, $a0
12+
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
13+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
14+
; CHECK-NEXT: bitrev.8b $a0, $a0
15+
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
16+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
17+
; CHECK-NEXT: bitrev.8b $a0, $a0
18+
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
19+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
20+
; CHECK-NEXT: bitrev.8b $a0, $a0
21+
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
22+
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
2323
; CHECK-NEXT: ret
2424
%b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
2525
ret <32 x i8> %b
@@ -30,25 +30,19 @@ declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>)
3030
define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
3131
; CHECK-LABEL: test_bitreverse_v16i16:
3232
; CHECK: # %bb.0:
33-
; CHECK-NEXT: xvshuf4i.b $xr0, $xr0, 177
34-
; CHECK-NEXT: xvsrli.h $xr1, $xr0, 4
35-
; CHECK-NEXT: xvrepli.b $xr2, 15
36-
; CHECK-NEXT: xvand.v $xr1, $xr1, $xr2
37-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2
38-
; CHECK-NEXT: xvslli.h $xr0, $xr0, 4
39-
; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
40-
; CHECK-NEXT: xvsrli.h $xr1, $xr0, 2
41-
; CHECK-NEXT: xvrepli.b $xr2, 51
42-
; CHECK-NEXT: xvand.v $xr1, $xr1, $xr2
43-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2
44-
; CHECK-NEXT: xvslli.h $xr0, $xr0, 2
45-
; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
46-
; CHECK-NEXT: xvsrli.h $xr1, $xr0, 1
47-
; CHECK-NEXT: xvrepli.b $xr2, 85
48-
; CHECK-NEXT: xvand.v $xr1, $xr1, $xr2
49-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2
50-
; CHECK-NEXT: xvslli.h $xr0, $xr0, 1
51-
; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
33+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
34+
; CHECK-NEXT: bitrev.d $a0, $a0
35+
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
36+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
37+
; CHECK-NEXT: bitrev.d $a0, $a0
38+
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
39+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
40+
; CHECK-NEXT: bitrev.d $a0, $a0
41+
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
42+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
43+
; CHECK-NEXT: bitrev.d $a0, $a0
44+
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
45+
; CHECK-NEXT: xvshuf4i.h $xr0, $xr1, 27
5246
; CHECK-NEXT: ret
5347
%b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
5448
ret <16 x i16> %b
@@ -59,25 +53,19 @@ declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>)
5953
define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
6054
; CHECK-LABEL: test_bitreverse_v8i32:
6155
; CHECK: # %bb.0:
62-
; CHECK-NEXT: xvshuf4i.b $xr0, $xr0, 27
63-
; CHECK-NEXT: xvsrli.w $xr1, $xr0, 4
64-
; CHECK-NEXT: xvrepli.b $xr2, 15
65-
; CHECK-NEXT: xvand.v $xr1, $xr1, $xr2
66-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2
67-
; CHECK-NEXT: xvslli.w $xr0, $xr0, 4
68-
; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
69-
; CHECK-NEXT: xvsrli.w $xr1, $xr0, 2
70-
; CHECK-NEXT: xvrepli.b $xr2, 51
71-
; CHECK-NEXT: xvand.v $xr1, $xr1, $xr2
72-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2
73-
; CHECK-NEXT: xvslli.w $xr0, $xr0, 2
74-
; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
75-
; CHECK-NEXT: xvsrli.w $xr1, $xr0, 1
76-
; CHECK-NEXT: xvrepli.b $xr2, 85
77-
; CHECK-NEXT: xvand.v $xr1, $xr1, $xr2
78-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2
79-
; CHECK-NEXT: xvslli.w $xr0, $xr0, 1
80-
; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
56+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
57+
; CHECK-NEXT: bitrev.d $a0, $a0
58+
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
59+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
60+
; CHECK-NEXT: bitrev.d $a0, $a0
61+
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
62+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
63+
; CHECK-NEXT: bitrev.d $a0, $a0
64+
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
65+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
66+
; CHECK-NEXT: bitrev.d $a0, $a0
67+
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
68+
; CHECK-NEXT: xvshuf4i.w $xr0, $xr1, 177
8169
; CHECK-NEXT: ret
8270
%b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
8371
ret <8 x i32> %b

llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll

Lines changed: 21 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,13 @@ declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>)
77
define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
88
; CHECK-LABEL: test_bitreverse_v16i8:
99
; CHECK: # %bb.0:
10-
; CHECK-NEXT: vslli.b $vr1, $vr0, 4
11-
; CHECK-NEXT: vsrli.b $vr0, $vr0, 4
12-
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
13-
; CHECK-NEXT: vandi.b $vr1, $vr0, 51
14-
; CHECK-NEXT: vslli.b $vr1, $vr1, 2
15-
; CHECK-NEXT: vsrli.b $vr0, $vr0, 2
16-
; CHECK-NEXT: vandi.b $vr0, $vr0, 51
17-
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
18-
; CHECK-NEXT: vandi.b $vr1, $vr0, 85
19-
; CHECK-NEXT: vslli.b $vr1, $vr1, 1
20-
; CHECK-NEXT: vsrli.b $vr0, $vr0, 1
21-
; CHECK-NEXT: vandi.b $vr0, $vr0, 85
22-
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
10+
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0
11+
; CHECK-NEXT: bitrev.8b $a0, $a0
12+
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
13+
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1
14+
; CHECK-NEXT: bitrev.8b $a0, $a0
15+
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1
16+
; CHECK-NEXT: vori.b $vr0, $vr1, 0
2317
; CHECK-NEXT: ret
2418
%b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
2519
ret <16 x i8> %b
@@ -30,25 +24,13 @@ declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>)
3024
define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
3125
; CHECK-LABEL: test_bitreverse_v8i16:
3226
; CHECK: # %bb.0:
33-
; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 177
34-
; CHECK-NEXT: vsrli.h $vr1, $vr0, 4
35-
; CHECK-NEXT: vrepli.b $vr2, 15
36-
; CHECK-NEXT: vand.v $vr1, $vr1, $vr2
37-
; CHECK-NEXT: vand.v $vr0, $vr0, $vr2
38-
; CHECK-NEXT: vslli.h $vr0, $vr0, 4
39-
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
40-
; CHECK-NEXT: vsrli.h $vr1, $vr0, 2
41-
; CHECK-NEXT: vrepli.b $vr2, 51
42-
; CHECK-NEXT: vand.v $vr1, $vr1, $vr2
43-
; CHECK-NEXT: vand.v $vr0, $vr0, $vr2
44-
; CHECK-NEXT: vslli.h $vr0, $vr0, 2
45-
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
46-
; CHECK-NEXT: vsrli.h $vr1, $vr0, 1
47-
; CHECK-NEXT: vrepli.b $vr2, 85
48-
; CHECK-NEXT: vand.v $vr1, $vr1, $vr2
49-
; CHECK-NEXT: vand.v $vr0, $vr0, $vr2
50-
; CHECK-NEXT: vslli.h $vr0, $vr0, 1
51-
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
27+
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0
28+
; CHECK-NEXT: bitrev.d $a0, $a0
29+
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
30+
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1
31+
; CHECK-NEXT: bitrev.d $a0, $a0
32+
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1
33+
; CHECK-NEXT: vshuf4i.h $vr0, $vr1, 27
5234
; CHECK-NEXT: ret
5335
%b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
5436
ret <8 x i16> %b
@@ -59,25 +41,13 @@ declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>)
5941
define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
6042
; CHECK-LABEL: test_bitreverse_v4i32:
6143
; CHECK: # %bb.0:
62-
; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 27
63-
; CHECK-NEXT: vsrli.w $vr1, $vr0, 4
64-
; CHECK-NEXT: vrepli.b $vr2, 15
65-
; CHECK-NEXT: vand.v $vr1, $vr1, $vr2
66-
; CHECK-NEXT: vand.v $vr0, $vr0, $vr2
67-
; CHECK-NEXT: vslli.w $vr0, $vr0, 4
68-
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
69-
; CHECK-NEXT: vsrli.w $vr1, $vr0, 2
70-
; CHECK-NEXT: vrepli.b $vr2, 51
71-
; CHECK-NEXT: vand.v $vr1, $vr1, $vr2
72-
; CHECK-NEXT: vand.v $vr0, $vr0, $vr2
73-
; CHECK-NEXT: vslli.w $vr0, $vr0, 2
74-
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
75-
; CHECK-NEXT: vsrli.w $vr1, $vr0, 1
76-
; CHECK-NEXT: vrepli.b $vr2, 85
77-
; CHECK-NEXT: vand.v $vr1, $vr1, $vr2
78-
; CHECK-NEXT: vand.v $vr0, $vr0, $vr2
79-
; CHECK-NEXT: vslli.w $vr0, $vr0, 1
80-
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
44+
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0
45+
; CHECK-NEXT: bitrev.d $a0, $a0
46+
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
47+
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1
48+
; CHECK-NEXT: bitrev.d $a0, $a0
49+
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1
50+
; CHECK-NEXT: vshuf4i.w $vr0, $vr1, 177
8151
; CHECK-NEXT: ret
8252
%b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
8353
ret <4 x i32> %b

0 commit comments

Comments
 (0)