Skip to content

Commit 576085c

Browse files
authored
[SelectionDAG][RISCV] Add support for splitting vp.splice (#145184)
Use a stack-based expansion similar to the non-VP splice. This code has been in our downstream for a while, though I don't know how often it is exercised. Our downstream was missing clamping of the immediate value to keep it within the bounds of the stack object, so I've added that here.
1 parent 1128a4f commit 576085c

File tree

3 files changed

+217
-0
lines changed

3 files changed

+217
-0
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -985,6 +985,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
985985
void SplitVecRes_VECTOR_INTERLEAVE(SDNode *N);
986986
void SplitVecRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi);
987987
void SplitVecRes_FP_TO_XINT_SAT(SDNode *N, SDValue &Lo, SDValue &Hi);
988+
void SplitVecRes_VP_SPLICE(SDNode *N, SDValue &Lo, SDValue &Hi);
988989
void SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo, SDValue &Hi);
989990
void SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo, SDValue &Hi);
990991
void SplitVecRes_GET_ACTIVE_LANE_MASK(SDNode *N, SDValue &Lo, SDValue &Hi);

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1382,6 +1382,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
13821382
case ISD::UDIVFIXSAT:
13831383
SplitVecRes_FIX(N, Lo, Hi);
13841384
break;
1385+
case ISD::EXPERIMENTAL_VP_SPLICE:
1386+
SplitVecRes_VP_SPLICE(N, Lo, Hi);
1387+
break;
13851388
case ISD::EXPERIMENTAL_VP_REVERSE:
13861389
SplitVecRes_VP_REVERSE(N, Lo, Hi);
13871390
break;
@@ -3209,6 +3212,78 @@ void DAGTypeLegalizer::SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo,
32093212
std::tie(Lo, Hi) = DAG.SplitVector(Load, DL);
32103213
}
32113214

3215+
// Split the result of an EXPERIMENTAL_VP_SPLICE node whose vector type must
// be split in half. Strategy (mirrors the non-VP splice expansion): store
// both source vectors back-to-back into a double-width stack temporary, then
// VP-load the spliced result from the appropriate byte offset within that
// temporary, and finally split the loaded vector into Lo/Hi with
// EXTRACT_SUBVECTOR.
//
// Operands of N: (vec1, vec2, constant imm offset, mask, evl1, evl2).
void DAGTypeLegalizer::SplitVecRes_VP_SPLICE(SDNode *N, SDValue &Lo,
                                             SDValue &Hi) {
  EVT VT = N->getValueType(0);
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  // Splice offset: >= 0 counts elements from the start of V1's data,
  // < 0 counts trailing elements back from the end of V1's data.
  int64_t Imm = cast<ConstantSDNode>(N->getOperand(2))->getSExtValue();
  SDValue Mask = N->getOperand(3);
  SDValue EVL1 = N->getOperand(4);
  SDValue EVL2 = N->getOperand(5);
  SDLoc DL(N);

  // Since EVL2 is considered the real VL it gets promoted during
  // SelectionDAGBuilder. Promote EVL1 here if needed.
  if (getTypeAction(EVL1.getValueType()) == TargetLowering::TypePromoteInteger)
    EVL1 = ZExtPromotedInteger(EVL1);

  Align Alignment = DAG.getReducedAlign(VT, /*UseABI=*/false);

  // The stack temporary must hold both source vectors back-to-back, so give
  // it twice the element count of the result type.
  EVT MemVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
                               VT.getVectorElementCount() * 2);
  SDValue StackPtr = DAG.CreateStackTemporary(MemVT.getStoreSize(), Alignment);
  EVT PtrVT = StackPtr.getValueType();
  auto &MF = DAG.getMachineFunction();
  auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);

  // The access offsets are runtime values (they depend on EVL1), so the MMOs
  // use an unknown location size within the fixed stack object.
  MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer(),
      Alignment);
  MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, LocationSize::beforeOrAfterPointer(),
      Alignment);

  // Address just past V1's stored data: EVL1 elements into the temporary.
  // V2 is stored starting here so the two vectors are contiguous.
  SDValue StackPtr2 = TLI.getVectorElementPointer(DAG, StackPtr, VT, EVL1);

  // The stores write all EVL-covered lanes unconditionally; the user's mask
  // is applied on the load below.
  SDValue TrueMask = DAG.getBoolConstant(true, DL, Mask.getValueType(), VT);
  SDValue StoreV1 = DAG.getStoreVP(DAG.getEntryNode(), DL, V1, StackPtr,
                                   DAG.getUNDEF(PtrVT), TrueMask, EVL1,
                                   V1.getValueType(), StoreMMO, ISD::UNINDEXED);

  SDValue StoreV2 =
      DAG.getStoreVP(StoreV1, DL, V2, StackPtr2, DAG.getUNDEF(PtrVT), TrueMask,
                     EVL2, V2.getValueType(), StoreMMO, ISD::UNINDEXED);

  SDValue Load;
  if (Imm >= 0) {
    // Non-negative offset: the result starts Imm elements into the stored
    // data. NOTE(review): in-range clamping of the index is delegated to
    // getVectorElementPointer here — confirm it clamps constant indices too.
    StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VT, N->getOperand(2));
    Load = DAG.getLoadVP(VT, DL, StoreV2, StackPtr, Mask, EVL2, LoadMMO);
  } else {
    // Negative offset: the result starts TrailingElts elements before the
    // start of V2's stored data (i.e. before StackPtr2).
    uint64_t TrailingElts = -Imm;
    // NOTE(review): assumes a byte-multiple element size — a scalar size
    // below 8 bits would make EltWidth 0; presumably such types never reach
    // this expansion — confirm.
    unsigned EltWidth = VT.getScalarSizeInBits() / 8;
    SDValue TrailingBytes = DAG.getConstant(TrailingElts * EltWidth, DL, PtrVT);

    // Make sure TrailingBytes doesn't exceed the size of vec1.
    SDValue OffsetToV2 = DAG.getNode(ISD::SUB, DL, PtrVT, StackPtr2, StackPtr);
    TrailingBytes =
        DAG.getNode(ISD::UMIN, DL, PtrVT, TrailingBytes, OffsetToV2);

    // Calculate the start address of the spliced result.
    StackPtr2 = DAG.getNode(ISD::SUB, DL, PtrVT, StackPtr2, TrailingBytes);
    Load = DAG.getLoadVP(VT, DL, StoreV2, StackPtr2, Mask, EVL2, LoadMMO);
  }

  // Split the single reloaded vector into the two result halves.
  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, Load,
                   DAG.getVectorIdxConstant(0, DL));
  Hi =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, Load,
                  DAG.getVectorIdxConstant(LoVT.getVectorMinNumElements(), DL));
}
3286+
32123287
void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo,
32133288
SDValue &Hi) {
32143289
SDLoc DL(N);

llvm/test/CodeGen/RISCV/rvv/vp-splice.ll

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,3 +286,144 @@ define <vscale x 2 x float> @test_vp_splice_nxv2f32_masked(<vscale x 2 x float>
286286
%v = call <vscale x 2 x float> @llvm.experimental.vp.splice.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, i32 5, <vscale x 2 x i1> %mask, i32 %evla, i32 %evlb)
287287
ret <vscale x 2 x float> %v
288288
}
289+
290+
; Exercises the split-vector legalization of vp.splice: nxv16i64 exceeds the
; largest legal RVV type, with a positive splice offset (5). The expansion
; spills both operands to a stack temporary and reloads the spliced result.
define <vscale x 16 x i64> @test_vp_splice_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) nounwind {
; CHECK-LABEL: test_vp_splice_nxv16i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    csrr a4, vlenb
; CHECK-NEXT:    slli a5, a4, 1
; CHECK-NEXT:    addi a5, a5, -1
; CHECK-NEXT:    slli a1, a4, 3
; CHECK-NEXT:    mv a7, a2
; CHECK-NEXT:    bltu a2, a5, .LBB21_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    mv a7, a5
; CHECK-NEXT:  .LBB21_2:
; CHECK-NEXT:    addi sp, sp, -80
; CHECK-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
; CHECK-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
; CHECK-NEXT:    addi s0, sp, 80
; CHECK-NEXT:    csrr a5, vlenb
; CHECK-NEXT:    slli a5, a5, 5
; CHECK-NEXT:    sub sp, sp, a5
; CHECK-NEXT:    andi sp, sp, -64
; CHECK-NEXT:    add a5, a0, a1
; CHECK-NEXT:    slli a7, a7, 3
; CHECK-NEXT:    addi a6, sp, 64
; CHECK-NEXT:    mv t0, a2
; CHECK-NEXT:    bltu a2, a4, .LBB21_4
; CHECK-NEXT:  # %bb.3:
; CHECK-NEXT:    mv t0, a4
; CHECK-NEXT:  .LBB21_4:
; CHECK-NEXT:    vl8re64.v v24, (a5)
; CHECK-NEXT:    add a5, a6, a7
; CHECK-NEXT:    vl8re64.v v0, (a0)
; CHECK-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
; CHECK-NEXT:    vse64.v v8, (a6)
; CHECK-NEXT:    sub a0, a2, a4
; CHECK-NEXT:    sltu a2, a2, a0
; CHECK-NEXT:    addi a2, a2, -1
; CHECK-NEXT:    and a0, a2, a0
; CHECK-NEXT:    add a6, a6, a1
; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT:    vse64.v v16, (a6)
; CHECK-NEXT:    mv a0, a3
; CHECK-NEXT:    bltu a3, a4, .LBB21_6
; CHECK-NEXT:  # %bb.5:
; CHECK-NEXT:    mv a0, a4
; CHECK-NEXT:  .LBB21_6:
; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT:    vse64.v v0, (a5)
; CHECK-NEXT:    sub a2, a3, a4
; CHECK-NEXT:    add a5, a5, a1
; CHECK-NEXT:    sltu a3, a3, a2
; CHECK-NEXT:    addi a3, a3, -1
; CHECK-NEXT:    and a2, a3, a2
; CHECK-NEXT:    addi a3, sp, 104
; CHECK-NEXT:    add a1, a3, a1
; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT:    vse64.v v24, (a5)
; CHECK-NEXT:    vle64.v v16, (a1)
; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT:    vle64.v v8, (a3)
; CHECK-NEXT:    addi sp, s0, -80
; CHECK-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
; CHECK-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
; CHECK-NEXT:    addi sp, sp, 80
; CHECK-NEXT:    ret
  %v = call <vscale x 16 x i64> @llvm.experimental.vp.splice.nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 5, <vscale x 16 x i1> splat (i1 1), i32 %evla, i32 %evlb)
  ret <vscale x 16 x i64> %v
}
357+
358+
; Same split-vector legalization of vp.splice as above but with a negative
; splice offset (-1), covering the trailing-elements path that clamps the
; trailing-byte count to stay within the stack object.
define <vscale x 16 x i64> @test_vp_splice_nxv16i64_negative_offset(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) nounwind {
; CHECK-LABEL: test_vp_splice_nxv16i64_negative_offset:
; CHECK:       # %bb.0:
; CHECK-NEXT:    csrr a5, vlenb
; CHECK-NEXT:    slli a6, a5, 1
; CHECK-NEXT:    addi a6, a6, -1
; CHECK-NEXT:    slli a1, a5, 3
; CHECK-NEXT:    mv a4, a2
; CHECK-NEXT:    bltu a2, a6, .LBB22_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    mv a4, a6
; CHECK-NEXT:  .LBB22_2:
; CHECK-NEXT:    addi sp, sp, -80
; CHECK-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
; CHECK-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
; CHECK-NEXT:    addi s0, sp, 80
; CHECK-NEXT:    csrr a6, vlenb
; CHECK-NEXT:    slli a6, a6, 5
; CHECK-NEXT:    sub sp, sp, a6
; CHECK-NEXT:    andi sp, sp, -64
; CHECK-NEXT:    add a6, a0, a1
; CHECK-NEXT:    slli a4, a4, 3
; CHECK-NEXT:    addi a7, sp, 64
; CHECK-NEXT:    mv t0, a2
; CHECK-NEXT:    bltu a2, a5, .LBB22_4
; CHECK-NEXT:  # %bb.3:
; CHECK-NEXT:    mv t0, a5
; CHECK-NEXT:  .LBB22_4:
; CHECK-NEXT:    vl8re64.v v24, (a6)
; CHECK-NEXT:    add a6, a7, a4
; CHECK-NEXT:    vl8re64.v v0, (a0)
; CHECK-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
; CHECK-NEXT:    vse64.v v8, (a7)
; CHECK-NEXT:    sub a0, a2, a5
; CHECK-NEXT:    sltu a2, a2, a0
; CHECK-NEXT:    addi a2, a2, -1
; CHECK-NEXT:    and a0, a2, a0
; CHECK-NEXT:    add a7, a7, a1
; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT:    vse64.v v16, (a7)
; CHECK-NEXT:    mv a0, a3
; CHECK-NEXT:    bltu a3, a5, .LBB22_6
; CHECK-NEXT:  # %bb.5:
; CHECK-NEXT:    mv a0, a5
; CHECK-NEXT:  .LBB22_6:
; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT:    vse64.v v0, (a6)
; CHECK-NEXT:    sub a2, a3, a5
; CHECK-NEXT:    add a5, a6, a1
; CHECK-NEXT:    sltu a3, a3, a2
; CHECK-NEXT:    addi a3, a3, -1
; CHECK-NEXT:    and a2, a3, a2
; CHECK-NEXT:    li a3, 8
; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT:    vse64.v v24, (a5)
; CHECK-NEXT:    bltu a4, a3, .LBB22_8
; CHECK-NEXT:  # %bb.7:
; CHECK-NEXT:    li a4, 8
; CHECK-NEXT:  .LBB22_8:
; CHECK-NEXT:    sub a2, a6, a4
; CHECK-NEXT:    add a1, a2, a1
; CHECK-NEXT:    vle64.v v16, (a1)
; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT:    vle64.v v8, (a2)
; CHECK-NEXT:    addi sp, s0, -80
; CHECK-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
; CHECK-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
; CHECK-NEXT:    addi sp, sp, 80
; CHECK-NEXT:    ret
  %v = call <vscale x 16 x i64> @llvm.experimental.vp.splice.nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i64> %vb, i32 -1, <vscale x 16 x i1> splat (i1 1), i32 %evla, i32 %evlb)
  ret <vscale x 16 x i64> %v
}

0 commit comments

Comments
 (0)