Skip to content

Commit 90beda2

Browse files
authored
[LoongArch] Lower vector_shuffle as lane permute and shuffle for lasx if possible. (#141196)
1 parent 03bbd04 commit 90beda2

File tree

2 files changed

+91
-129
lines changed

2 files changed

+91
-129
lines changed

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2136,6 +2136,51 @@ static void canonicalizeShuffleVectorByLane(const SDLoc &DL,
21362136
}
21372137
}
21382138

2139+
/// Lower VECTOR_SHUFFLE as lane permute and then shuffle (if possible).
2140+
/// Only for 256-bit vector. Returns an empty SDValue when no defined mask
/// element crosses the boundary between the two 128-bit lanes.
2141+
///
2142+
/// For example:
2143+
/// %2 = shufflevector <4 x i64> %0, <4 x i64> poison,
2144+
/// <4 x i64> <i32 0, i32 3, i32 2, i32 0>
2145+
/// is lowered to:
2146+
/// (XVPERMI $xr2, $xr0, 78)
2147+
/// (XVSHUF $xr1, $xr2, $xr0)
2148+
/// (XVORI $xr0, $xr1, 0)
///
/// NOTE(review): V2 is never read in this routine and the final shuffle only
/// uses V1 and a lane-swapped copy of V1, so this presumably expects a
/// single-input shuffle (V2 undef) -- confirm against the caller.
2149+
static SDValue lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(const SDLoc &DL,
2150+
ArrayRef<int> Mask,
2151+
MVT VT, SDValue V1,
2152+
SDValue V2,
2153+
SelectionDAG &DAG) {
2154+
assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
2155+
int Size = Mask.size();
2156+
// The 256-bit vector is treated as two lanes of Size / 2 elements each.
int LaneSize = Size / 2;
2157+
2158+
// LaneCrossing[L] records that some defined mask element reads from source
// lane L but is placed in the other lane. Negative (undef) entries are
// ignored.
bool LaneCrossing[2] = {false, false};
2159+
for (int i = 0; i < Size; ++i)
2160+
if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
2161+
LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
2162+
2163+
// Bail out unless at least one element crosses lanes.
2164+
if (!LaneCrossing[0] && !LaneCrossing[1])
2165+
return SDValue();
2166+
2167+
// Build a mask for the final two-operand shuffle: start from a copy of Mask
// and redirect every lane-crossing entry to the second operand (index offset
// by Size), keeping its offset within the lane but using the destination
// lane's base so the access stays in-lane.
SmallVector<int> InLaneMask;
2168+
InLaneMask.assign(Mask.begin(), Mask.end());
2169+
for (int i = 0; i < Size; ++i) {
2170+
int &M = InLaneMask[i];
2171+
if (M < 0)
2172+
continue;
2173+
if (((M % Size) / LaneSize) != (i / LaneSize))
2174+
M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
2175+
}
2176+
2177+
// Flipped is V1 with its two 128-bit halves swapped: view V1 as v4i64,
// shuffle with {2, 3, 0, 1}, then cast back to VT.
SDValue Flipped = DAG.getBitcast(MVT::v4i64, V1);
2178+
Flipped = DAG.getVectorShuffle(MVT::v4i64, DL, Flipped,
2179+
DAG.getUNDEF(MVT::v4i64), {2, 3, 0, 1});
2180+
Flipped = DAG.getBitcast(VT, Flipped);
2181+
// Shuffle V1 (first operand) with Flipped (second operand) using the
// rewritten mask.
return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
2182+
}
2183+
21392184
/// Dispatching routine to lower various 256-bit LoongArch vector shuffles.
21402185
///
21412186
/// This routine breaks down the specific type of 256-bit shuffle and
@@ -2168,6 +2213,9 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
21682213
return Result;
21692214
if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG)))
21702215
return Result;
2216+
if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT,
2217+
V1, V2, DAG)))
2218+
return Result;
21712219

21722220
// TODO: This comment may be enabled in the future to better match the
21732221
// pattern for instruction selection.

llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll

Lines changed: 43 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -4,76 +4,14 @@
44
define <32 x i8> @shuffle_v32i8(<32 x i8> %a) {
55
; CHECK-LABEL: shuffle_v32i8:
66
; CHECK: # %bb.0:
7-
; CHECK-NEXT: addi.d $sp, $sp, -64
8-
; CHECK-NEXT: .cfi_def_cfa_offset 64
9-
; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill
10-
; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill
11-
; CHECK-NEXT: .cfi_offset 1, -8
12-
; CHECK-NEXT: .cfi_offset 22, -16
13-
; CHECK-NEXT: addi.d $fp, $sp, 64
14-
; CHECK-NEXT: .cfi_def_cfa 22, 0
15-
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
16-
; CHECK-NEXT: xvori.b $xr1, $xr0, 0
17-
; CHECK-NEXT: xvst $xr0, $sp, 0
18-
; CHECK-NEXT: ld.h $a0, $sp, 16
19-
; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
20-
; CHECK-NEXT: vpickve2gr.h $a1, $vr1, 0
21-
; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 1
22-
; CHECK-NEXT: vpickve2gr.h $a1, $vr1, 1
23-
; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 2
24-
; CHECK-NEXT: vpickve2gr.h $a1, $vr1, 2
25-
; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 3
26-
; CHECK-NEXT: vpickve2gr.h $a1, $vr1, 3
27-
; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 4
28-
; CHECK-NEXT: vpickve2gr.h $a1, $vr1, 4
29-
; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 5
30-
; CHECK-NEXT: vpickve2gr.h $a1, $vr1, 5
31-
; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 6
32-
; CHECK-NEXT: vpickve2gr.h $a1, $vr1, 6
33-
; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 7
34-
; CHECK-NEXT: xvori.b $xr1, $xr0, 0
35-
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
36-
; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 0
37-
; CHECK-NEXT: ld.h $a0, $sp, 18
38-
; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
39-
; CHECK-NEXT: xvori.b $xr1, $xr0, 0
40-
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
41-
; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 1
42-
; CHECK-NEXT: ld.h $a0, $sp, 20
43-
; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
44-
; CHECK-NEXT: xvori.b $xr1, $xr0, 0
45-
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
46-
; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 2
47-
; CHECK-NEXT: ld.h $a0, $sp, 22
48-
; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
49-
; CHECK-NEXT: xvori.b $xr1, $xr0, 0
50-
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
51-
; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 3
52-
; CHECK-NEXT: ld.h $a0, $sp, 24
53-
; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
54-
; CHECK-NEXT: xvori.b $xr1, $xr0, 0
55-
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
56-
; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 4
57-
; CHECK-NEXT: ld.h $a0, $sp, 26
58-
; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
59-
; CHECK-NEXT: xvori.b $xr1, $xr0, 0
60-
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
61-
; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 5
62-
; CHECK-NEXT: ld.h $a0, $sp, 28
63-
; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
64-
; CHECK-NEXT: xvori.b $xr1, $xr0, 0
65-
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
66-
; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 6
67-
; CHECK-NEXT: ld.h $a0, $sp, 30
68-
; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
69-
; CHECK-NEXT: xvori.b $xr1, $xr0, 0
70-
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
71-
; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 7
72-
; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
73-
; CHECK-NEXT: addi.d $sp, $fp, -64
74-
; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
75-
; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload
76-
; CHECK-NEXT: addi.d $sp, $sp, 64
7+
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0)
8+
; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI0_0)
9+
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1)
10+
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI0_1)
11+
; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
12+
; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
13+
; CHECK-NEXT: xvshuf.h $xr1, $xr2, $xr0
14+
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
7715
; CHECK-NEXT: ret
7816
%shuffle = shufflevector <32 x i8> %a, <32 x i8> poison, <32 x i32> <i32 16, i32 17, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
7917
ret <32 x i8> %shuffle
@@ -83,21 +21,13 @@ define <32 x i8> @shuffle_v32i8(<32 x i8> %a) {
8321
define <16 x i16> @shuffle_v16i16(<16 x i16> %a) {
8422
; CHECK-LABEL: shuffle_v16i16:
8523
; CHECK: # %bb.0:
86-
; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 4
87-
; CHECK-NEXT: xvinsgr2vr.w $xr1, $a0, 0
88-
; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 0
89-
; CHECK-NEXT: xvinsgr2vr.w $xr1, $a1, 1
90-
; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 1
91-
; CHECK-NEXT: xvinsgr2vr.w $xr1, $a1, 2
92-
; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 2
93-
; CHECK-NEXT: xvinsgr2vr.w $xr1, $a1, 3
94-
; CHECK-NEXT: xvinsgr2vr.w $xr1, $a0, 4
95-
; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 5
96-
; CHECK-NEXT: xvinsgr2vr.w $xr1, $a0, 5
97-
; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 6
98-
; CHECK-NEXT: xvinsgr2vr.w $xr1, $a0, 6
99-
; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7
100-
; CHECK-NEXT: xvinsgr2vr.w $xr1, $a0, 7
24+
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0)
25+
; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI1_0)
26+
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_1)
27+
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI1_1)
28+
; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
29+
; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
30+
; CHECK-NEXT: xvshuf.w $xr1, $xr2, $xr0
10131
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
10232
; CHECK-NEXT: ret
10333
%shuffle = shufflevector <16 x i16> %a, <16 x i16> poison, <16 x i32> <i32 8, i32 9, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -107,13 +37,13 @@ define <16 x i16> @shuffle_v16i16(<16 x i16> %a) {
10737
define <8 x i32> @shuffle_v8i32(<8 x i32> %a) {
10838
; CHECK-LABEL: shuffle_v8i32:
10939
; CHECK: # %bb.0:
110-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
111-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
112-
; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 0
113-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a1, 1
114-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
115-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
116-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
40+
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0)
41+
; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI2_0)
42+
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1)
43+
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI2_1)
44+
; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
45+
; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
46+
; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0
11747
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
11848
; CHECK-NEXT: ret
11949
%shuffle = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
@@ -123,14 +53,13 @@ define <8 x i32> @shuffle_v8i32(<8 x i32> %a) {
12353
define <4 x i64> @shuffle_v4i64(<4 x i64> %a) {
12454
; CHECK-LABEL: shuffle_v4i64:
12555
; CHECK: # %bb.0:
126-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
127-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
128-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
129-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
130-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
131-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
132-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
133-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
56+
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
57+
; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI3_0)
58+
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_1)
59+
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_1)
60+
; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
61+
; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
62+
; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0
13463
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
13564
; CHECK-NEXT: ret
13665
%shuffle = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
@@ -140,19 +69,13 @@ define <4 x i64> @shuffle_v4i64(<4 x i64> %a) {
14069
define <8 x float> @shuffle_v8f32(<8 x float> %a) {
14170
; CHECK-LABEL: shuffle_v8f32:
14271
; CHECK: # %bb.0:
143-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
144-
; CHECK-NEXT: movgr2fr.d $fa1, $a0
145-
; CHECK-NEXT: movfr2gr.d $a0, $fa1
146-
; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 0
147-
; CHECK-NEXT: movgr2fr.d $fa2, $a1
148-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
149-
; CHECK-NEXT: movfr2gr.d $a1, $fa2
150-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a1, 1
151-
; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 3
152-
; CHECK-NEXT: movgr2fr.d $fa0, $a1
153-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
154-
; CHECK-NEXT: movfr2gr.d $a0, $fa0
155-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
72+
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0)
73+
; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI4_0)
74+
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_1)
75+
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI4_1)
76+
; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
77+
; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
78+
; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0
15679
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
15780
; CHECK-NEXT: ret
15881
%shuffle = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
@@ -162,22 +85,13 @@ define <8 x float> @shuffle_v8f32(<8 x float> %a) {
16285
define <4 x double> @shuffle_v4f64(<4 x double> %a) {
16386
; CHECK-LABEL: shuffle_v4f64:
16487
; CHECK: # %bb.0:
165-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
166-
; CHECK-NEXT: movgr2fr.d $fa1, $a0
167-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
168-
; CHECK-NEXT: movgr2fr.d $fa2, $a0
169-
; CHECK-NEXT: movfr2gr.d $a0, $fa1
170-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
171-
; CHECK-NEXT: movfr2gr.d $a0, $fa2
172-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
173-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
174-
; CHECK-NEXT: movgr2fr.d $fa2, $a0
175-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
176-
; CHECK-NEXT: movgr2fr.d $fa0, $a0
177-
; CHECK-NEXT: movfr2gr.d $a0, $fa2
178-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
179-
; CHECK-NEXT: movfr2gr.d $a0, $fa0
180-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
88+
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0)
89+
; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI5_0)
90+
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_1)
91+
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI5_1)
92+
; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78
93+
; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3
94+
; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0
18195
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
18296
; CHECK-NEXT: ret
18397
%shuffle = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>

0 commit comments

Comments
 (0)