[RISCV] Improve lowering of spread(2) shuffles #118658
Conversation
A spread(2) shuffle is just an interleave with an undef lane. The existing lowering reused the even lane for the undef value. This was entirely legal, but non-optimal.
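As a concrete illustration (adapted from the shuffle_spread2_singlesrc_e32 test updated below; the function name here is just a placeholder), a spread(2) shuffle places consecutive source elements in the even result lanes and leaves the odd lanes undef:

define <8 x i32> @spread2_example(<8 x i32> %v) {
  ; Even lanes take elements 0..3 of %v; odd lanes are undef, so this is
  ; an interleave of %v's low half with an undef vector.
  %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef>
  ret <8 x i32> %out
}

With the undef half passed through explicitly, the test diff below lowers this to a single vzext.vf2 rather than the previous vwaddu.vv/vwmaccu.vx pair.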
@llvm/pr-subscribers-backend-risc-v
Author: Philip Reames (preames)
Changes
A spread(2) shuffle is just an interleave with an undef lane. The existing lowering reused the even lane for the undef value. This was entirely legal, but non-optimal.
Full diff: https://github.com/llvm/llvm-project/pull/118658.diff
4 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4544a922def1a3..1efb9e7d52dbd2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5331,17 +5331,36 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
// Extract the halves of the vectors.
MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ // Recognize if one half is actually undef; the matching above will
+ // otherwise reuse the even stream for the undef one. This improves
+ // spread(2) shuffles.
+ bool EvenIsUndef = true, OddIsUndef = true;
+ for (unsigned i = 0; i < Mask.size(); i++) {
+ if (i % 2 == 0)
+ EvenIsUndef &= (Mask[i] == -1);
+ else
+ OddIsUndef &= (Mask[i] == -1);
+ }
+
int Size = Mask.size();
SDValue EvenV, OddV;
- assert(EvenSrc >= 0 && "Undef source?");
- EvenV = (EvenSrc / Size) == 0 ? V1 : V2;
- EvenV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, EvenV,
- DAG.getVectorIdxConstant(EvenSrc % Size, DL));
-
- assert(OddSrc >= 0 && "Undef source?");
- OddV = (OddSrc / Size) == 0 ? V1 : V2;
- OddV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, OddV,
- DAG.getVectorIdxConstant(OddSrc % Size, DL));
+ if (EvenIsUndef) {
+ EvenV = DAG.getUNDEF(HalfVT);
+ } else {
+ assert(EvenSrc >= 0 && "Undef source?");
+ EvenV = (EvenSrc / Size) == 0 ? V1 : V2;
+ EvenV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, EvenV,
+ DAG.getVectorIdxConstant(EvenSrc % Size, DL));
+ }
+
+ if (OddIsUndef) {
+ OddV = DAG.getUNDEF(HalfVT);
+ } else {
+ assert(OddSrc >= 0 && "Undef source?");
+ OddV = (OddSrc / Size) == 0 ? V1 : V2;
+ OddV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, OddV,
+ DAG.getVectorIdxConstant(OddSrc % Size, DL));
+ }
return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget);
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
index e4b8e9debad271..97e458e70565ce 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -242,33 +242,27 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
; V128-NEXT: slli a0, a0, 3
; V128-NEXT: sub sp, sp, a0
; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; V128-NEXT: vmv8r.v v24, v16
-; V128-NEXT: vmv8r.v v16, v8
-; V128-NEXT: vmv8r.v v8, v24
; V128-NEXT: addi a0, sp, 16
-; V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT: vslidedown.vi v0, v24, 16
-; V128-NEXT: li a0, -1
-; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT: vwaddu.vv v24, v8, v0
-; V128-NEXT: vwmaccu.vx v24, a0, v0
-; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT: vslidedown.vi v0, v16, 16
+; V128-NEXT: vslidedown.vi v24, v16, 16
+; V128-NEXT: li a0, 32
+; V128-NEXT: vslidedown.vi v0, v8, 16
; V128-NEXT: lui a1, 699051
-; V128-NEXT: li a2, 32
-; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT: vwaddu.vv v8, v0, v16
+; V128-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; V128-NEXT: vzext.vf2 v8, v24
+; V128-NEXT: vzext.vf2 v24, v0
; V128-NEXT: addi a1, a1, -1366
; V128-NEXT: vmv.s.x v0, a1
-; V128-NEXT: vwmaccu.vx v8, a0, v16
-; V128-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; V128-NEXT: vmerge.vvm v24, v8, v24, v0
-; V128-NEXT: addi a1, sp, 16
-; V128-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; V128-NEXT: vsll.vx v8, v8, a0
+; V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; V128-NEXT: vmerge.vvm v24, v24, v8, v0
+; V128-NEXT: addi a0, sp, 16
+; V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT: vwaddu.vv v0, v16, v8
-; V128-NEXT: vwmaccu.vx v0, a0, v8
+; V128-NEXT: vwaddu.vv v0, v8, v16
+; V128-NEXT: li a0, -1
+; V128-NEXT: vwmaccu.vx v0, a0, v16
; V128-NEXT: vmv8r.v v8, v0
; V128-NEXT: vmv8r.v v16, v24
; V128-NEXT: csrr a0, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
index 66af5718fb9dc5..a8eb1f97fd1a2c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -186,35 +186,29 @@ define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
; V128-LABEL: interleave_v4i32_offset_1:
; V128: # %bb.0:
-; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; V128-NEXT: vwaddu.vv v10, v8, v8
-; V128-NEXT: li a0, -1
; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; V128-NEXT: vid.v v11
-; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; V128-NEXT: vwmaccu.vx v10, a0, v8
-; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; V128-NEXT: vsrl.vi v8, v11, 1
+; V128-NEXT: vid.v v10
; V128-NEXT: vmv.v.i v0, 10
-; V128-NEXT: vadd.vi v8, v8, 1
-; V128-NEXT: vrgather.vv v10, v9, v8, v0.t
+; V128-NEXT: vsrl.vi v10, v10, 1
+; V128-NEXT: vadd.vi v11, v10, 1
+; V128-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; V128-NEXT: vzext.vf2 v10, v8
+; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu
+; V128-NEXT: vrgather.vv v10, v9, v11, v0.t
; V128-NEXT: vmv.v.v v8, v10
; V128-NEXT: ret
;
; V512-LABEL: interleave_v4i32_offset_1:
; V512: # %bb.0:
-; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; V512-NEXT: vwaddu.vv v10, v8, v8
-; V512-NEXT: li a0, -1
; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma
-; V512-NEXT: vid.v v11
-; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; V512-NEXT: vwmaccu.vx v10, a0, v8
-; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu
-; V512-NEXT: vsrl.vi v8, v11, 1
+; V512-NEXT: vid.v v10
; V512-NEXT: vmv.v.i v0, 10
-; V512-NEXT: vadd.vi v8, v8, 1
-; V512-NEXT: vrgather.vv v10, v9, v8, v0.t
+; V512-NEXT: vsrl.vi v10, v10, 1
+; V512-NEXT: vadd.vi v11, v10, 1
+; V512-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; V512-NEXT: vzext.vf2 v10, v8
+; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu
+; V512-NEXT: vrgather.vv v10, v9, v11, v0.t
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
%a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 5, i32 1, i32 6>
@@ -411,33 +405,27 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
; V128-NEXT: slli a0, a0, 3
; V128-NEXT: sub sp, sp, a0
; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; V128-NEXT: vmv8r.v v24, v16
-; V128-NEXT: vmv8r.v v16, v8
-; V128-NEXT: vmv8r.v v8, v24
; V128-NEXT: addi a0, sp, 16
-; V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT: vslidedown.vi v0, v24, 16
-; V128-NEXT: li a0, -1
-; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT: vwaddu.vv v24, v8, v0
-; V128-NEXT: vwmaccu.vx v24, a0, v0
+; V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT: vslidedown.vi v0, v16, 16
+; V128-NEXT: vslidedown.vi v24, v16, 16
+; V128-NEXT: li a0, 32
+; V128-NEXT: vslidedown.vi v0, v8, 16
; V128-NEXT: lui a1, 699051
-; V128-NEXT: li a2, 32
-; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT: vwaddu.vv v8, v0, v16
+; V128-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; V128-NEXT: vzext.vf2 v8, v24
+; V128-NEXT: vzext.vf2 v24, v0
; V128-NEXT: addi a1, a1, -1366
; V128-NEXT: vmv.s.x v0, a1
-; V128-NEXT: vwmaccu.vx v8, a0, v16
-; V128-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; V128-NEXT: vmerge.vvm v24, v8, v24, v0
-; V128-NEXT: addi a1, sp, 16
-; V128-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; V128-NEXT: vsll.vx v8, v8, a0
+; V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; V128-NEXT: vmerge.vvm v24, v24, v8, v0
+; V128-NEXT: addi a0, sp, 16
+; V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT: vwaddu.vv v0, v16, v8
-; V128-NEXT: vwmaccu.vx v0, a0, v8
+; V128-NEXT: vwaddu.vv v0, v8, v16
+; V128-NEXT: li a0, -1
+; V128-NEXT: vwmaccu.vx v0, a0, v16
; V128-NEXT: vmv8r.v v8, v0
; V128-NEXT: vmv8r.v v16, v24
; V128-NEXT: csrr a0, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index 5d307211ead6e6..a9ae2181333e88 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -801,11 +801,9 @@ define <8 x i32> @shuffle_compress_singlesrc_gaps_e32(<8 x i32> %v) {
define <8 x i32> @shuffle_spread2_singlesrc_e32(<8 x i32> %v) {
; CHECK-LABEL: shuffle_spread2_singlesrc_e32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vwaddu.vv v10, v8, v8
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vwmaccu.vx v10, a0, v8
-; CHECK-NEXT: vmv2r.v v8, v10
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf2 v10, v8
+; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
%out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef>
ret <8 x i32> %out
@@ -814,11 +812,10 @@ define <8 x i32> @shuffle_spread2_singlesrc_e32(<8 x i32> %v) {
define <8 x i32> @shuffle_spread2_singlesrc_e32_index1(<8 x i32> %v) {
; CHECK-LABEL: shuffle_spread2_singlesrc_e32_index1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vwaddu.vv v10, v8, v8
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vwmaccu.vx v10, a0, v8
-; CHECK-NEXT: vmv2r.v v8, v10
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf2 v10, v8
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsll.vx v8, v10, a0
; CHECK-NEXT: ret
%out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3>
ret <8 x i32> %out
    OddV = (OddSrc / Size) == 0 ? V1 : V2;
    OddV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, OddV,
                       DAG.getVectorIdxConstant(OddSrc % Size, DL));
  }

  return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget);
Note that we already have the required undef handling in getWideningInterleave because it's used by the deinterleave2 intrinsic lowering.
; V128-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; V128-NEXT: vsll.vx v8, v8, a0
; V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; V128-NEXT: vmerge.vvm v24, v24, v8, v0
The odd sequence of two spread(2) shuffles followed by a merge, which could instead be a single interleave, comes from lowering this:
t24: v32i32 = vector_shuffle<16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63> t4, t7
This fails our current restrictions in the definition of isInterleaveShuffle, but we could probably relax that. I'm going to glance at that separately.
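For reference, written out as IR that node is roughly the following sketch (%x and %y are placeholders for t4 and t7); it interleaves the upper 16 elements of each source:

define <32 x i32> @upper_half_interleave(<32 x i32> %x, <32 x i32> %y) {
  ; Interleaves elements 16..31 of %x with elements 16..31 of %y.
  %r = shufflevector <32 x i32> %x, <32 x i32> %y,
         <32 x i32> <i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51,
                     i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55,
                     i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59,
                     i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  ret <32 x i32> %r
}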
I did glance at this, and decided not to pursue it for now. The change itself is simple, but it exposes two unrelated issues I don't have cycles to fix: 1) isShuffleMaskLegal causes unprofitable transforms somewhere in the generic DAG, and 2) machine scheduling creates a truly adversarial schedule, and the register allocator is forced to greatly increase spilling.
LGTM