Skip to content

Commit 25e2381

Browse files
committed
[X86] Use GFNI for vXi8 shifts/rotates
As detailed here: https://github.com/InstLatx64/InstLatX64_Demo/blob/master/GFNI_Demo.h We can use the gf2p8affine instruction to lower byte shifts/rotates as well as the existing bitreverse case. There's a few other GFNI patterns we can probably handle - e.g. TZCNT/LZCNT were detailed on PR47394 Based off the original patch here: https://reviews.llvm.org/D137026
1 parent 215eee6 commit 25e2381

23 files changed

+1070
-1521
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28988,6 +28988,29 @@ SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
2898828988
return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
2898928989
}
2899028990

28991+
// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28992+
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28993+
assert((Amt < 8) && "Shift/Rotation amount out of range");
28994+
switch (Opcode) {
28995+
case ISD::BITREVERSE:
28996+
return 0x8040201008040201ULL;
28997+
case ISD::SHL:
28998+
return ((0x0102040810204080ULL >> (Amt)) &
28999+
(0x0101010101010101ULL * (0xFF >> (Amt))));
29000+
case ISD::SRL:
29001+
return ((0x0102040810204080ULL << (Amt)) &
29002+
(0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
29003+
case ISD::SRA:
29004+
return (getGFNICtrlImm(ISD::SRL, Amt) |
29005+
(0x8080808080808080ULL >> (64 - (8 * Amt))));
29006+
case ISD::ROTL:
29007+
return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
29008+
case ISD::ROTR:
29009+
return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
29010+
}
29011+
llvm_unreachable("Unsupported GFNI opcode");
29012+
}
29013+
2899129014
// Return true if the required (according to Opcode) shift-imm form is natively
2899229015
// supported by the Subtarget
2899329016
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
@@ -29175,6 +29198,14 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
2917529198
if (VT == MVT::v16i8 && Subtarget.hasXOP())
2917629199
return SDValue();
2917729200

29201+
if (Subtarget.hasGFNI()) {
29202+
uint64_t ShiftMask = getGFNICtrlImm(Op.getOpcode(), ShiftAmt);
29203+
MVT MaskVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
29204+
SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(ShiftMask, dl, MaskVT));
29205+
return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
29206+
DAG.getTargetConstant(0, dl, MVT::i8));
29207+
}
29208+
2917829209
if (Op.getOpcode() == ISD::SHL) {
2917929210
// Make a large shift.
2918029211
SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
@@ -30046,6 +30077,17 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
3004630077
DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
3004730078
}
3004830079

30080+
// Attempt to use GFNI gf2p8affine to rotate vXi8 by an uniform constant.
30081+
if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
30082+
DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
30083+
uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30084+
uint64_t RotMask = getGFNICtrlImm(Opcode, RotAmt);
30085+
MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
30086+
SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(RotMask, DL, MaskVT));
30087+
return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
30088+
DAG.getTargetConstant(0, DL, MVT::i8));
30089+
}
30090+
3004930091
// Split 256-bit integers on XOP/pre-AVX2 targets.
3005030092
if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
3005130093
return splitVectorIntBinary(Op, DAG, DL);
@@ -31369,7 +31411,8 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
3136931411
// If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
3137031412
if (Subtarget.hasGFNI()) {
3137131413
MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
31372-
SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
31414+
SDValue Matrix =
31415+
DAG.getConstant(getGFNICtrlImm(ISD::BITREVERSE), DL, MatrixVT);
3137331416
Matrix = DAG.getBitcast(VT, Matrix);
3137431417
return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
3137531418
DAG.getTargetConstant(0, DL, MVT::i8));

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,24 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
345345
Op1Info.getNoProps(), Op2Info.getNoProps());
346346
}
347347

348+
static const CostKindTblEntry GFNIUniformConstCostTable[] = {
349+
{ ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
350+
{ ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
351+
{ ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
352+
{ ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
353+
{ ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
354+
{ ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
355+
{ ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
356+
{ ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
357+
{ ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
358+
};
359+
360+
if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
361+
if (const auto *Entry =
362+
CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
363+
if (auto KindCost = Entry->Cost[CostKind])
364+
return LT.first * *KindCost;
365+
348366
static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
349367
{ ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
350368
{ ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
@@ -3860,6 +3878,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
38603878
{ ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
38613879
{ ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
38623880
{ ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
3881+
{ X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3882+
{ X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
3883+
{ X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
38633884
};
38643885
static const CostKindTblEntry GLMCostTbl[] = {
38653886
{ ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss

llvm/test/Analysis/CostModel/X86/fshl-codesize.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1597,9 +1597,9 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
15971597
;
15981598
; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
15991599
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
1600-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1601-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1602-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1600+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1601+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1602+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
16031603
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
16041604
;
16051605
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
@@ -2871,9 +2871,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
28712871
;
28722872
; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
28732873
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
2874-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2875-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2876-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2874+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2875+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2876+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
28772877
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
28782878
;
28792879
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)

llvm/test/Analysis/CostModel/X86/fshl-latency.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1549,9 +1549,9 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
15491549
;
15501550
; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
15511551
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
1552-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1553-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1554-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1552+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1553+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
1554+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
15551555
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
15561556
;
15571557
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
@@ -2823,9 +2823,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
28232823
;
28242824
; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
28252825
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
2826-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2827-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2828-
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2826+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2827+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
2828+
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
28292829
; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
28302830
;
28312831
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)

0 commit comments

Comments
 (0)