Skip to content

Commit 6840521

Browse files
committed
Revert "[RISCV][CG]Use processShuffleMasks for per-register shuffles"
This reverts commit b8952d4. spec x264 fails to build in all VLS configurations, with the assertion failure: clang: ../llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp:5246: llvm::SDValue lowerShuffleViaVRegSplitting(llvm::ShuffleVectorSDNode*, llvm::SelectionDAG&, const llvm::RISCVSubtarget&): Assertion `RegCnt == NumOfDestRegs && "Whole vector must be processed"' failed. I can reduce a failing piece of IR, but the failure appears pretty broad, so I suspect any reasonable VLS build will hit it.
1 parent 8435225 commit 6840521

File tree

2 files changed

+89
-93
lines changed

2 files changed

+89
-93
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 41 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -5104,6 +5104,7 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
51045104
SDValue V1 = SVN->getOperand(0);
51055105
SDValue V2 = SVN->getOperand(1);
51065106
ArrayRef<int> Mask = SVN->getMask();
5107+
unsigned NumElts = VT.getVectorNumElements();
51075108

51085109
// If we don't know exact data layout, not much we can do. If this
51095110
// is already m1 or smaller, no point in splitting further.
@@ -5120,70 +5121,58 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
51205121

51215122
MVT ElemVT = VT.getVectorElementType();
51225123
unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
5124+
unsigned VRegsPerSrc = NumElts / ElemsPerVReg;
5125+
5126+
SmallVector<std::pair<int, SmallVector<int>>>
5127+
OutMasks(VRegsPerSrc, {-1, {}});
5128+
5129+
// Check if our mask can be done as a 1-to-1 mapping from source
5130+
// to destination registers in the group without needing to
5131+
// write each destination more than once.
5132+
for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx++) {
5133+
int DstVecIdx = DstIdx / ElemsPerVReg;
5134+
int DstSubIdx = DstIdx % ElemsPerVReg;
5135+
int SrcIdx = Mask[DstIdx];
5136+
if (SrcIdx < 0 || (unsigned)SrcIdx >= 2 * NumElts)
5137+
continue;
5138+
int SrcVecIdx = SrcIdx / ElemsPerVReg;
5139+
int SrcSubIdx = SrcIdx % ElemsPerVReg;
5140+
if (OutMasks[DstVecIdx].first == -1)
5141+
OutMasks[DstVecIdx].first = SrcVecIdx;
5142+
if (OutMasks[DstVecIdx].first != SrcVecIdx)
5143+
// Note: This case could easily be handled by keeping track of a chain
5144+
// of source values and generating two element shuffles below. This is
5145+
// less an implementation question, and more a profitability one.
5146+
return SDValue();
5147+
5148+
OutMasks[DstVecIdx].second.resize(ElemsPerVReg, -1);
5149+
OutMasks[DstVecIdx].second[DstSubIdx] = SrcSubIdx;
5150+
}
51235151

51245152
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
51255153
MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
51265154
MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
51275155
assert(M1VT == getLMUL1VT(M1VT));
51285156
unsigned NumOpElts = M1VT.getVectorMinNumElements();
5129-
unsigned NormalizedVF = ContainerVT.getVectorMinNumElements();
5130-
unsigned NumOfSrcRegs = NormalizedVF / NumOpElts;
5131-
unsigned NumOfDestRegs = NormalizedVF / NumOpElts;
5157+
SDValue Vec = DAG.getUNDEF(ContainerVT);
51325158
// The following semantically builds up a fixed length concat_vector
51335159
// of the component shuffle_vectors. We eagerly lower to scalable here
51345160
// to avoid DAG combining it back to a large shuffle_vector again.
51355161
V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
51365162
V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
5137-
SmallVector<SDValue> SubRegs(NumOfDestRegs);
5138-
unsigned RegCnt = 0;
5139-
unsigned PrevCnt = 0;
5140-
processShuffleMasks(
5141-
Mask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs,
5142-
[&]() {
5143-
PrevCnt = RegCnt;
5144-
++RegCnt;
5145-
},
5146-
[&, &DAG = DAG](ArrayRef<int> SrcSubMask, unsigned SrcVecIdx,
5147-
unsigned DstVecIdx) {
5148-
SDValue SrcVec = SrcVecIdx >= NumOfSrcRegs ? V2 : V1;
5149-
unsigned ExtractIdx = (SrcVecIdx % NumOfSrcRegs) * NumOpElts;
5150-
SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
5151-
DAG.getVectorIdxConstant(ExtractIdx, DL));
5152-
SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
5153-
SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask);
5154-
SubRegs[RegCnt] = convertToScalableVector(M1VT, SubVec, DAG, Subtarget);
5155-
PrevCnt = RegCnt;
5156-
++RegCnt;
5157-
},
5158-
[&, &DAG = DAG](ArrayRef<int> SrcSubMask, unsigned Idx1, unsigned Idx2) {
5159-
if (PrevCnt + 1 == RegCnt)
5160-
++RegCnt;
5161-
SDValue SubVec1 = SubRegs[PrevCnt + 1];
5162-
if (!SubVec1) {
5163-
SDValue SrcVec = Idx1 >= NumOfSrcRegs ? V2 : V1;
5164-
unsigned ExtractIdx = (Idx1 % NumOfSrcRegs) * NumOpElts;
5165-
SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
5166-
DAG.getVectorIdxConstant(ExtractIdx, DL));
5167-
}
5168-
SubVec1 = convertFromScalableVector(OneRegVT, SubVec1, DAG, Subtarget);
5169-
SDValue SrcVec = Idx2 >= NumOfSrcRegs ? V2 : V1;
5170-
unsigned ExtractIdx = (Idx2 % NumOfSrcRegs) * NumOpElts;
5171-
SDValue SubVec2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
5172-
DAG.getVectorIdxConstant(ExtractIdx, DL));
5173-
SubVec2 = convertFromScalableVector(OneRegVT, SubVec2, DAG, Subtarget);
5174-
SubVec1 =
5175-
DAG.getVectorShuffle(OneRegVT, DL, SubVec1, SubVec2, SrcSubMask);
5176-
SubVec1 = convertToScalableVector(M1VT, SubVec1, DAG, Subtarget);
5177-
SubRegs[PrevCnt + 1] = SubVec1;
5178-
});
5179-
assert(RegCnt == NumOfDestRegs && "Whole vector must be processed");
5180-
SDValue Vec = DAG.getUNDEF(ContainerVT);
5181-
for (auto [I, V] : enumerate(SubRegs)) {
5182-
if (!V)
5163+
for (unsigned DstVecIdx = 0 ; DstVecIdx < OutMasks.size(); DstVecIdx++) {
5164+
auto &[SrcVecIdx, SrcSubMask] = OutMasks[DstVecIdx];
5165+
if (SrcVecIdx == -1)
51835166
continue;
5184-
unsigned InsertIdx = I * NumOpElts;
5185-
5186-
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, V,
5167+
unsigned ExtractIdx = (SrcVecIdx % VRegsPerSrc) * NumOpElts;
5168+
SDValue SrcVec = (unsigned)SrcVecIdx >= VRegsPerSrc ? V2 : V1;
5169+
SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
5170+
DAG.getVectorIdxConstant(ExtractIdx, DL));
5171+
SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
5172+
SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask);
5173+
SubVec = convertToScalableVector(M1VT, SubVec, DAG, Subtarget);
5174+
unsigned InsertIdx = DstVecIdx * NumOpElts;
5175+
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, SubVec,
51875176
DAG.getVectorIdxConstant(InsertIdx, DL));
51885177
}
51895178
return convertFromScalableVector(VT, Vec, DAG, Subtarget);

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll

Lines changed: 48 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -168,11 +168,12 @@ define <4 x i64> @m2_splat_into_slide_two_source_v2_lo(<4 x i64> %v1, <4 x i64>
168168
define <4 x i64> @m2_splat_into_slide_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) {
169169
; CHECK-LABEL: m2_splat_into_slide_two_source:
170170
; CHECK: # %bb.0:
171-
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
172-
; CHECK-NEXT: vslidedown.vi v13, v10, 1
173-
; CHECK-NEXT: vslideup.vi v13, v11, 1
171+
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
172+
; CHECK-NEXT: vmv.v.i v0, 12
173+
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu
174174
; CHECK-NEXT: vrgather.vi v12, v8, 0
175-
; CHECK-NEXT: vmv2r.v v8, v12
175+
; CHECK-NEXT: vslideup.vi v12, v10, 1, v0.t
176+
; CHECK-NEXT: vmv.v.v v8, v12
176177
; CHECK-NEXT: ret
177178
%res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> <i32 0, i32 0, i32 5, i32 6>
178179
ret <4 x i64> %res
@@ -182,17 +183,18 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) {
182183
; CHECK-LABEL: shuffle1:
183184
; CHECK: # %bb.0:
184185
; CHECK-NEXT: addi a0, a0, 252
185-
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
186-
; CHECK-NEXT: vmv.v.i v8, 0
187186
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
188-
; CHECK-NEXT: vid.v v10
187+
; CHECK-NEXT: vid.v v8
189188
; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma
190-
; CHECK-NEXT: vle32.v v11, (a0)
191-
; CHECK-NEXT: vmv.v.i v0, 5
192-
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
193-
; CHECK-NEXT: vsrl.vi v10, v10, 1
194-
; CHECK-NEXT: vadd.vi v10, v10, 1
195-
; CHECK-NEXT: vrgather.vv v9, v11, v10, v0.t
189+
; CHECK-NEXT: vle32.v v9, (a0)
190+
; CHECK-NEXT: li a0, 175
191+
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
192+
; CHECK-NEXT: vsrl.vi v8, v8, 1
193+
; CHECK-NEXT: vmv.s.x v0, a0
194+
; CHECK-NEXT: vadd.vi v8, v8, 1
195+
; CHECK-NEXT: vrgather.vv v11, v9, v8
196+
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
197+
; CHECK-NEXT: vmerge.vim v8, v10, 0, v0
196198
; CHECK-NEXT: addi a0, a1, 672
197199
; CHECK-NEXT: vs2r.v v8, (a0)
198200
; CHECK-NEXT: ret
@@ -209,15 +211,15 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) {
209211
define <16 x float> @shuffle2(<4 x float> %a) vscale_range(2,2) {
210212
; CHECK-LABEL: shuffle2:
211213
; CHECK: # %bb.0:
214+
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
215+
; CHECK-NEXT: vid.v v9
216+
; CHECK-NEXT: li a0, -97
217+
; CHECK-NEXT: vadd.vv v9, v9, v9
218+
; CHECK-NEXT: vrsub.vi v9, v9, 4
219+
; CHECK-NEXT: vmv.s.x v0, a0
220+
; CHECK-NEXT: vrgather.vv v13, v8, v9
212221
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
213-
; CHECK-NEXT: vmv1r.v v12, v8
214-
; CHECK-NEXT: vmv.v.i v8, 0
215-
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
216-
; CHECK-NEXT: vid.v v13
217-
; CHECK-NEXT: vadd.vv v13, v13, v13
218-
; CHECK-NEXT: vmv.v.i v0, 6
219-
; CHECK-NEXT: vrsub.vi v13, v13, 4
220-
; CHECK-NEXT: vrgather.vv v9, v12, v13, v0.t
222+
; CHECK-NEXT: vmerge.vim v8, v12, 0, v0
221223
; CHECK-NEXT: ret
222224
%b = extractelement <4 x float> %a, i32 2
223225
%c = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %b, i32 5
@@ -229,15 +231,16 @@ define <16 x float> @shuffle2(<4 x float> %a) vscale_range(2,2) {
229231
define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vscale_range(2,2) {
230232
; RV32-LABEL: extract_any_extend_vector_inreg_v16i64:
231233
; RV32: # %bb.0:
232-
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
234+
; RV32-NEXT: li a1, 16
235+
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu
233236
; RV32-NEXT: vmv.v.i v16, 0
234-
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu
235-
; RV32-NEXT: vmv.v.i v0, 1
237+
; RV32-NEXT: vmv.s.x v0, a1
236238
; RV32-NEXT: li a1, 32
237-
; RV32-NEXT: vrgather.vi v18, v15, 1, v0.t
238-
; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
239+
; RV32-NEXT: vrgather.vi v16, v8, 15, v0.t
240+
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
239241
; RV32-NEXT: vslidedown.vx v8, v16, a0
240242
; RV32-NEXT: vmv.x.s a0, v8
243+
; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
241244
; RV32-NEXT: vsrl.vx v8, v8, a1
242245
; RV32-NEXT: vmv.x.s a1, v8
243246
; RV32-NEXT: ret
@@ -255,14 +258,13 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vsca
255258
; RV64-NEXT: addi s0, sp, 256
256259
; RV64-NEXT: .cfi_def_cfa s0, 0
257260
; RV64-NEXT: andi sp, sp, -128
258-
; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
259-
; RV64-NEXT: vmv.v.i v0, 1
261+
; RV64-NEXT: li a1, -17
260262
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
261-
; RV64-NEXT: vmv.v.i v16, 0
262-
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu
263-
; RV64-NEXT: vrgather.vi v18, v15, 1, v0.t
263+
; RV64-NEXT: vmv.s.x v0, a1
264+
; RV64-NEXT: vrgather.vi v16, v8, 15
265+
; RV64-NEXT: vmerge.vim v8, v16, 0, v0
264266
; RV64-NEXT: mv s2, sp
265-
; RV64-NEXT: vs8r.v v16, (s2)
267+
; RV64-NEXT: vs8r.v v8, (s2)
266268
; RV64-NEXT: andi a0, a0, 15
267269
; RV64-NEXT: li a1, 8
268270
; RV64-NEXT: call __muldi3
@@ -288,16 +290,21 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vsca
288290
define <4 x double> @shuffles_add(<4 x double> %0, <4 x double> %1) vscale_range(2,2) {
289291
; CHECK-LABEL: shuffles_add:
290292
; CHECK: # %bb.0:
291-
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu
292-
; CHECK-NEXT: vmv1r.v v13, v10
293-
; CHECK-NEXT: vslideup.vi v13, v11, 1
294-
; CHECK-NEXT: vmv1r.v v8, v9
295-
; CHECK-NEXT: vmv.v.i v0, 1
296-
; CHECK-NEXT: vrgather.vi v12, v9, 0
297-
; CHECK-NEXT: vmv1r.v v9, v11
298-
; CHECK-NEXT: vrgather.vi v9, v10, 1, v0.t
299293
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
300-
; CHECK-NEXT: vfadd.vv v8, v12, v8
294+
; CHECK-NEXT: vrgather.vi v12, v8, 2
295+
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
296+
; CHECK-NEXT: vid.v v14
297+
; CHECK-NEXT: vmv.v.i v0, 12
298+
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
299+
; CHECK-NEXT: vrgather.vi v16, v8, 3
300+
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
301+
; CHECK-NEXT: vadd.vv v8, v14, v14
302+
; CHECK-NEXT: vadd.vi v9, v8, -4
303+
; CHECK-NEXT: vadd.vi v8, v8, -3
304+
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu
305+
; CHECK-NEXT: vrgatherei16.vv v12, v10, v9, v0.t
306+
; CHECK-NEXT: vrgatherei16.vv v16, v10, v8, v0.t
307+
; CHECK-NEXT: vfadd.vv v8, v12, v16
301308
; CHECK-NEXT: ret
302309
%3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
303310
%4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>

0 commit comments

Comments
 (0)