Skip to content

Commit 1b4b0dd

Browse files
committed
[X86] Use GFNI for vXi8 shifts/rotates
As detailed in GFNI_Demo.h (https://github.com/InstLatx64/InstLatX64_Demo/blob/master/GFNI_Demo.h), we can use the gf2p8affine instruction to lower byte shifts/rotates as well as the existing bitreverse case. There are a few other GFNI patterns we can probably handle - e.g. TZCNT/LZCNT were detailed on PR47394. Based off the original patch here: https://reviews.llvm.org/D137026
1 parent c52b18d commit 1b4b0dd

23 files changed

+1472
-2121
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 49 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28988,6 +28988,29 @@ SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
2898828988
return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
2898928989
}
2899028990

28991+
// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28992+
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28993+
assert((Amt < 8) && "Shift/Rotation amount out of range");
28994+
switch (Opcode) {
28995+
case ISD::BITREVERSE:
28996+
return 0x8040201008040201ULL;
28997+
case ISD::SHL:
28998+
return ((0x0102040810204080ULL >> (Amt)) &
28999+
(0x0101010101010101ULL * (0xFF >> (Amt))));
29000+
case ISD::SRL:
29001+
return ((0x0102040810204080ULL << (Amt)) &
29002+
(0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
29003+
case ISD::SRA:
29004+
return (getGFNICtrlImm(ISD::SRL, Amt) |
29005+
(0x8080808080808080ULL >> (64 - (8 * Amt))));
29006+
case ISD::ROTL:
29007+
return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
29008+
case ISD::ROTR:
29009+
return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
29010+
}
29011+
llvm_unreachable("Unsupported GFNI opcode");
29012+
}
29013+
2899129014
// Return true if the required (according to Opcode) shift-imm form is natively
2899229015
// supported by the Subtarget
2899329016
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
@@ -29175,6 +29198,14 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
2917529198
if (VT == MVT::v16i8 && Subtarget.hasXOP())
2917629199
return SDValue();
2917729200

29201+
if (Subtarget.hasGFNI()) {
29202+
uint64_t ShiftMask = getGFNICtrlImm(Op.getOpcode(), ShiftAmt);
29203+
MVT MaskVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
29204+
SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(ShiftMask, dl, MaskVT));
29205+
return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
29206+
DAG.getTargetConstant(0, dl, MVT::i8));
29207+
}
29208+
2917829209
if (Op.getOpcode() == ISD::SHL) {
2917929210
// Make a large shift.
2918029211
SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
@@ -29860,13 +29891,15 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
2986029891
uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
2986129892
uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
2986229893
assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
29894+
MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
2986329895

29864-
if (EltSizeInBits == 8 && ShXAmt > 1 &&
29865-
(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT))) {
29896+
if (EltSizeInBits == 8 &&
29897+
(Subtarget.hasXOP() ||
29898+
(useVPTERNLOG(Subtarget, VT) &&
29899+
supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
2986629900
// For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
2986729901
// bit-select - lower using vXi16 shifts and then perform the bitmask at
2986829902
// the original vector width to handle cases where we split.
29869-
MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
2987029903
APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
2987129904
APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
2987229905
SDValue ShX =
@@ -30069,6 +30102,17 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
3006930102
DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
3007030103
}
3007130104

30105+
// Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
30106+
if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
30107+
DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
30108+
uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30109+
uint64_t RotMask = getGFNICtrlImm(Opcode, RotAmt);
30110+
MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
30111+
SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(RotMask, DL, MaskVT));
30112+
return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
30113+
DAG.getTargetConstant(0, DL, MVT::i8));
30114+
}
30115+
3007230116
// Split 256-bit integers on XOP/pre-AVX2 targets.
3007330117
if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
3007430118
return splitVectorIntBinary(Op, DAG, DL);
@@ -31392,7 +31436,8 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
3139231436
// If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
3139331437
if (Subtarget.hasGFNI()) {
3139431438
MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
31395-
SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
31439+
SDValue Matrix =
31440+
DAG.getConstant(getGFNICtrlImm(ISD::BITREVERSE), DL, MatrixVT);
3139631441
Matrix = DAG.getBitcast(VT, Matrix);
3139731442
return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
3139831443
DAG.getTargetConstant(0, DL, MVT::i8));

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,24 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
345345
Op1Info.getNoProps(), Op2Info.getNoProps());
346346
}
347347

348+
static const CostKindTblEntry GFNIUniformConstCostTable[] = {
349+
{ ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
350+
{ ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
351+
{ ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
352+
{ ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
353+
{ ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
354+
{ ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
355+
{ ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
356+
{ ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
357+
{ ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
358+
};
359+
360+
if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
361+
if (const auto *Entry =
362+
CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
363+
if (auto KindCost = Entry->Cost[CostKind])
364+
return LT.first * *KindCost;
365+
348366
static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
349367
{ ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
350368
{ ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
@@ -3868,6 +3886,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
38683886
{ ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
38693887
{ ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
38703888
{ ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3889+
{ X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3890+
{ X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3891+
{ X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
38713892
};
38723893
static const CostKindTblEntry GLMCostTbl[] = {
38733894
{ ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss

llvm/test/Analysis/CostModel/X86/fshl-codesize.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1597,9 +1597,9 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
15971597
;
15981598
; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
15991599
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
1600-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1601-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1602-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1600+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1601+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1602+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
16031603
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
16041604
;
16051605
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
@@ -2871,9 +2871,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
28712871
;
28722872
; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
28732873
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
2874-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2875-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2876-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2874+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2875+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2876+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
28772877
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
28782878
;
28792879
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)

llvm/test/Analysis/CostModel/X86/fshl-latency.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1549,9 +1549,9 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
15491549
;
15501550
; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
15511551
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
1552-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1553-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1554-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1552+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1553+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1554+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
15551555
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
15561556
;
15571557
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
@@ -2823,9 +2823,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
28232823
;
28242824
; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
28252825
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
2826-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2827-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2828-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2826+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2827+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2828+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
28292829
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
28302830
;
28312831
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)

0 commit comments

Comments
 (0)