Skip to content

Commit 8232ab7

Browse files
author
Dinar Temirbulatov
committed
[AArch64][SVE][SVE2] Enable tbl, tbl2 for shuffle lowering for fixed vector types.
This change enables some shuffle lowering via the TBL instruction with SVE and SVE2: the single-register TBL form is used when indexing into one register, and the SVE2 TBL2 form when indexing into both registers. Differential Revision: https://reviews.llvm.org/D152205
1 parent 06a05f3 commit 8232ab7

File tree

3 files changed

+358
-103
lines changed

3 files changed

+358
-103
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25830,6 +25830,77 @@ AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
2583025830
}
2583125831
}
2583225832

25833+
static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
25834+
ArrayRef<int> ShuffleMask, EVT VT,
25835+
EVT ContainerVT, SelectionDAG &DAG) {
25836+
auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
25837+
SDLoc DL(Op);
25838+
unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
25839+
unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
25840+
bool IsSingleOp = ShuffleVectorInst::isSingleSourceMask(ShuffleMask);
25841+
25842+
// Ignore two operands if no SVE2 or all index numbers couldn't
25843+
// be represented.
25844+
if (!IsSingleOp && (!Subtarget.hasSVE2() || MinSVESize != MaxSVESize))
25845+
return SDValue();
25846+
25847+
EVT VTOp1 = Op.getOperand(0).getValueType();
25848+
unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
25849+
unsigned IndexLen = MinSVESize / BitsPerElt;
25850+
unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
25851+
unsigned MaskSize = ShuffleMask.size();
25852+
uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
25853+
assert(ElementsPerVectorReg <= IndexLen && MaskSize <= IndexLen &&
25854+
"Incorrectly legalised shuffle operation");
25855+
25856+
SmallVector<SDValue, 8> TBLMask;
25857+
for (int Index : ShuffleMask) {
25858+
// Handling poison index value.
25859+
if (Index < 0)
25860+
Index = 0;
25861+
// If we refer to the second operand then we have to add elements
25862+
// number in hardware register minus number of elements in a type.
25863+
if ((unsigned)Index >= ElementsPerVectorReg)
25864+
Index += IndexLen - ElementsPerVectorReg;
25865+
// For 8-bit elements and 1024-bit SVE registers and MaxOffset equals
25866+
// to 255, this might point to the last element of in the second operand
25867+
// of the shufflevector, thus we are rejecting this transform.
25868+
if ((unsigned)Index >= MaxOffset)
25869+
return SDValue();
25870+
TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
25871+
}
25872+
25873+
// Choosing an out-of-range index leads to the lane being zeroed vs zero
25874+
// value where it would perform first lane duplication for out of
25875+
// index elements. For i8 elements an out-of-range index could be a valid
25876+
// for 2048-bit vector register size.
25877+
for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i)
25878+
TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
25879+
25880+
EVT MaskEltType = EVT::getIntegerVT(*DAG.getContext(), BitsPerElt);
25881+
EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
25882+
EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
25883+
SDValue VecMask =
25884+
DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
25885+
SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
25886+
25887+
SDValue Shuffle;
25888+
if (IsSingleOp)
25889+
Shuffle =
25890+
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
25891+
DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
25892+
Op1, SVEMask);
25893+
else if (Subtarget.hasSVE2())
25894+
Shuffle =
25895+
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
25896+
DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
25897+
Op1, Op2, SVEMask);
25898+
else
25899+
llvm_unreachable("Cannot lower shuffle without SVE2 TBL");
25900+
Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
25901+
return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
25902+
}
25903+
2583325904
SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
2583425905
SDValue Op, SelectionDAG &DAG) const {
2583525906
EVT VT = Op.getValueType();
@@ -25975,6 +26046,11 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
2597526046
}
2597626047
}
2597726048

26049+
// Avoid producing TBL instruction if we don't know SVE register minimal size.
26050+
if (MinSVESize)
26051+
return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
26052+
DAG);
26053+
2597826054
return SDValue();
2597926055
}
2598026056

llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll

Lines changed: 28 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -194,28 +194,13 @@ define void @test_revhv32i16(ptr %a) #0 {
194194
define void @test_rev_elts_fail(ptr %a) #1 {
195195
; CHECK-LABEL: test_rev_elts_fail:
196196
; CHECK: // %bb.0:
197-
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
198-
; CHECK-NEXT: sub x9, sp, #48
199-
; CHECK-NEXT: mov x29, sp
200-
; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
201-
; CHECK-NEXT: .cfi_def_cfa w29, 16
202-
; CHECK-NEXT: .cfi_offset w30, -8
203-
; CHECK-NEXT: .cfi_offset w29, -16
204197
; CHECK-NEXT: ptrue p0.d
198+
; CHECK-NEXT: adrp x8, .LCPI11_0
199+
; CHECK-NEXT: add x8, x8, :lo12:.LCPI11_0
205200
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
206-
; CHECK-NEXT: mov z1.d, z0.d[2]
207-
; CHECK-NEXT: mov z2.d, z0.d[3]
208-
; CHECK-NEXT: mov x9, v0.d[1]
209-
; CHECK-NEXT: fmov x8, d1
210-
; CHECK-NEXT: fmov x10, d2
211-
; CHECK-NEXT: stp x10, x8, [sp, #16]
212-
; CHECK-NEXT: fmov x8, d0
213-
; CHECK-NEXT: stp x9, x8, [sp]
214-
; CHECK-NEXT: mov x8, sp
215-
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
201+
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x8]
202+
; CHECK-NEXT: tbl z0.d, { z0.d }, z1.d
216203
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
217-
; CHECK-NEXT: mov sp, x29
218-
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
219204
; CHECK-NEXT: ret
220205
%tmp1 = load <4 x i64>, ptr %a
221206
%tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
@@ -260,39 +245,26 @@ define void @test_revdv4f64_sve2p1(ptr %a) #2 {
260245

261246
; sve-vector-bits-min=256, sve-vector-bits-max is not set, REV inst can't be generated.
262247
define void @test_revv8i32(ptr %a) #0 {
263-
; CHECK-LABEL: test_revv8i32:
264-
; CHECK: // %bb.0:
265-
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
266-
; CHECK-NEXT: sub x9, sp, #48
267-
; CHECK-NEXT: mov x29, sp
268-
; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
269-
; CHECK-NEXT: .cfi_def_cfa w29, 16
270-
; CHECK-NEXT: .cfi_offset w30, -8
271-
; CHECK-NEXT: .cfi_offset w29, -16
272-
; CHECK-NEXT: ptrue p0.s, vl8
273-
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
274-
; CHECK-NEXT: mov w8, v0.s[1]
275-
; CHECK-NEXT: mov w9, v0.s[2]
276-
; CHECK-NEXT: mov w10, v0.s[3]
277-
; CHECK-NEXT: fmov w11, s0
278-
; CHECK-NEXT: mov z1.s, z0.s[4]
279-
; CHECK-NEXT: mov z2.s, z0.s[5]
280-
; CHECK-NEXT: mov z3.s, z0.s[6]
281-
; CHECK-NEXT: mov z0.s, z0.s[7]
282-
; CHECK-NEXT: stp w8, w11, [sp, #24]
283-
; CHECK-NEXT: fmov w8, s1
284-
; CHECK-NEXT: stp w10, w9, [sp, #16]
285-
; CHECK-NEXT: fmov w9, s2
286-
; CHECK-NEXT: stp w9, w8, [sp, #8]
287-
; CHECK-NEXT: fmov w8, s3
288-
; CHECK-NEXT: fmov w9, s0
289-
; CHECK-NEXT: stp w9, w8, [sp]
290-
; CHECK-NEXT: mov x8, sp
291-
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
292-
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
293-
; CHECK-NEXT: mov sp, x29
294-
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
295-
; CHECK-NEXT: ret
248+
; VBITS_GE_256-LABEL: test_revv8i32:
249+
; VBITS_GE_256: // %bb.0:
250+
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
251+
; VBITS_GE_256-NEXT: index z0.s, #7, #-1
252+
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
253+
; VBITS_GE_256-NEXT: tbl z0.s, { z1.s }, z0.s
254+
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0]
255+
; VBITS_GE_256-NEXT: ret
256+
;
257+
; VBITS_GE_512-LABEL: test_revv8i32:
258+
; VBITS_GE_512: // %bb.0:
259+
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
260+
; VBITS_GE_512-NEXT: adrp x8, .LCPI14_0
261+
; VBITS_GE_512-NEXT: add x8, x8, :lo12:.LCPI14_0
262+
; VBITS_GE_512-NEXT: ptrue p1.s, vl16
263+
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
264+
; VBITS_GE_512-NEXT: ld1w { z1.s }, p1/z, [x8]
265+
; VBITS_GE_512-NEXT: tbl z0.s, { z0.s }, z1.s
266+
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
267+
; VBITS_GE_512-NEXT: ret
296268
%tmp1 = load <8 x i32>, ptr %a
297269
%tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
298270
store <8 x i32> %tmp2, ptr %a
@@ -379,60 +351,13 @@ define void @test_revv8i32v8i32(ptr %a, ptr %b) #1 {
379351
define void @test_rev_fail(ptr %a) #1 {
380352
; CHECK-LABEL: test_rev_fail:
381353
; CHECK: // %bb.0:
382-
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
383-
; CHECK-NEXT: sub x9, sp, #48
384-
; CHECK-NEXT: mov x29, sp
385-
; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
386-
; CHECK-NEXT: .cfi_def_cfa w29, 16
387-
; CHECK-NEXT: .cfi_offset w30, -8
388-
; CHECK-NEXT: .cfi_offset w29, -16
389354
; CHECK-NEXT: ptrue p0.h
355+
; CHECK-NEXT: adrp x8, .LCPI20_0
356+
; CHECK-NEXT: add x8, x8, :lo12:.LCPI20_0
390357
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
391-
; CHECK-NEXT: mov z1.h, z0.h[8]
392-
; CHECK-NEXT: fmov w8, s0
393-
; CHECK-NEXT: mov z2.h, z0.h[9]
394-
; CHECK-NEXT: mov z3.h, z0.h[10]
395-
; CHECK-NEXT: mov z4.h, z0.h[11]
396-
; CHECK-NEXT: strh w8, [sp, #14]
397-
; CHECK-NEXT: fmov w8, s1
398-
; CHECK-NEXT: mov z1.h, z0.h[12]
399-
; CHECK-NEXT: fmov w9, s2
400-
; CHECK-NEXT: mov z2.h, z0.h[13]
401-
; CHECK-NEXT: strh w8, [sp, #30]
402-
; CHECK-NEXT: fmov w8, s3
403-
; CHECK-NEXT: mov z3.h, z0.h[14]
404-
; CHECK-NEXT: strh w9, [sp, #28]
405-
; CHECK-NEXT: fmov w9, s4
406-
; CHECK-NEXT: mov z4.h, z0.h[15]
407-
; CHECK-NEXT: fmov w10, s2
408-
; CHECK-NEXT: strh w8, [sp, #26]
409-
; CHECK-NEXT: fmov w8, s1
410-
; CHECK-NEXT: fmov w11, s3
411-
; CHECK-NEXT: strh w9, [sp, #24]
412-
; CHECK-NEXT: umov w9, v0.h[1]
413-
; CHECK-NEXT: fmov w12, s4
414-
; CHECK-NEXT: strh w10, [sp, #20]
415-
; CHECK-NEXT: umov w10, v0.h[3]
416-
; CHECK-NEXT: strh w8, [sp, #22]
417-
; CHECK-NEXT: umov w8, v0.h[2]
418-
; CHECK-NEXT: strh w11, [sp, #18]
419-
; CHECK-NEXT: umov w11, v0.h[4]
420-
; CHECK-NEXT: strh w12, [sp, #16]
421-
; CHECK-NEXT: umov w12, v0.h[5]
422-
; CHECK-NEXT: strh w9, [sp, #12]
423-
; CHECK-NEXT: umov w9, v0.h[6]
424-
; CHECK-NEXT: strh w8, [sp, #10]
425-
; CHECK-NEXT: umov w8, v0.h[7]
426-
; CHECK-NEXT: strh w10, [sp, #8]
427-
; CHECK-NEXT: strh w11, [sp, #6]
428-
; CHECK-NEXT: strh w12, [sp, #4]
429-
; CHECK-NEXT: strh w9, [sp, #2]
430-
; CHECK-NEXT: strh w8, [sp]
431-
; CHECK-NEXT: mov x8, sp
432-
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
358+
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x8]
359+
; CHECK-NEXT: tbl z0.h, { z0.h }, z1.h
433360
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
434-
; CHECK-NEXT: mov sp, x29
435-
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
436361
; CHECK-NEXT: ret
437362
%tmp1 = load <16 x i16>, ptr %a
438363
%tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>

0 commit comments

Comments
 (0)