[WIP][X86] Use GFNI for vXi8 per-element shifts #89644
Conversation
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

As detailed here: https://github.com/InstLatx64/InstLatX64_Demo/blob/master/GFNI_Demo.h

These are a bit more complicated than the gf2p8affine lookups, requiring us to convert a SHL shift amount into a GF(2^8) multiplicand so we can perform the shift as a multiplication. SRL/SRA need to be converted to SHL via bitreverse / variable sign-extension.

Followup to #89115

CC @shamithoke

Patch is 231.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/89644.diff

3 Files Affected:
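For intuition, here is a minimal standalone sketch of the scalar per-byte model behind the lowering (illustrative only, not code from the patch; the helper names are made up). SHL masks off the bits that would overflow and then multiplies by 2^amt in GF(2^8), so the masked product never needs polynomial reduction and equals a plain byte shift; SRL/SRA are then derived via the bitreverse and sign-mask identities used in the patch. The two per-byte LUTs correspond to the 72909780498219007 and 9241421688590303745 constants that appear in the updated test checks.

```c++
#include <cassert>
#include <cstdint>

// GF(2^8) multiply as performed by GF2P8MULB (reduction poly x^8+x^4+x^3+x+1).
static uint8_t gf2p8mul(uint8_t A, uint8_t B) {
  uint8_t R = 0;
  for (int I = 0; I != 8; ++I) {
    if (B & 1)
      R ^= A;
    bool Carry = (A & 0x80) != 0;
    A <<= 1;
    if (Carry)
      A ^= 0x1B;
    B >>= 1;
  }
  return R;
}

static uint8_t bitrev8(uint8_t V) {
  uint8_t R = 0;
  for (int I = 0; I != 8; ++I)
    R |= (uint8_t)(((V >> I) & 1) << (7 - I));
  return R;
}

// shl(x,amt): keep only the bits that survive the shift, then multiply by
// 2^amt. The masked product fits in 8 bits, so no reduction happens and the
// result is exactly x << amt (and zero once the whole byte is masked away).
static uint8_t gfni_shl(uint8_t V, unsigned Amt) {
  uint8_t Mask = Amt < 8 ? (uint8_t)(0xFF >> Amt) : 0; // LUT 0x0103070F1F3F7FFF
  uint8_t Pow2 = Amt < 8 ? (uint8_t)(1 << Amt) : 0;    // LUT 0x8040201008040201
  return gf2p8mul(V & Mask, Pow2);
}

// srl(x,amt) --> bitreverse(shl(bitreverse(x),amt))
static uint8_t gfni_srl(uint8_t V, unsigned Amt) {
  return bitrev8(gfni_shl(bitrev8(V), Amt));
}

// sra(x,amt) --> sub(xor(srl(x,amt), m), m) where m = srl(signbit, amt).
static uint8_t gfni_sra(uint8_t V, unsigned Amt) {
  uint8_t M = gfni_srl(0x80, Amt);
  return (uint8_t)((gfni_srl(V, Amt) ^ M) - M);
}

int main() {
  for (unsigned V = 0; V != 256; ++V)
    for (unsigned Amt = 0; Amt != 8; ++Amt) {
      assert(gfni_shl((uint8_t)V, Amt) == (uint8_t)(V << Amt));
      assert(gfni_srl((uint8_t)V, Amt) == (uint8_t)(V >> Amt));
      assert(gfni_sra((uint8_t)V, Amt) == (uint8_t)((int8_t)(uint8_t)V >> Amt));
    }
  return 0;
}
```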
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dd40d079c7e2f7..b8e5255bc56c55 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29564,6 +29564,62 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
DAG.getNode(Opc, dl, ExtVT, R, Amt));
}
+ // GFNI - we can perform SHL with a GF multiplication, and can convert
+ // SRL/SRA to a SHL.
+ if (VT == MVT::v16i8 ||
+ (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
+ (VT == MVT::v64i8 && Subtarget.hasBWI())) {
+ if (Subtarget.hasGFNI() && Subtarget.hasSSSE3()) {
+ auto GFShiftLeft = [&](SDValue Val) {
+ // Use PSHUFB as a LUT from the shift amount to create a per-element
+ // byte mask for the shift value and an index. For shift amounts greater
+ // than 7, the result will be zero.
+ SmallVector<APInt, 8> MaskBits, IdxBits;
+ for (unsigned I = 0, E = VT.getSizeInBits() / 128; I != E; ++I) {
+ MaskBits.push_back(APInt(64, 0x0103070F1F3F7FFFULL));
+ IdxBits.push_back(APInt(64, 0x8040201008040201ULL));
+ MaskBits.push_back(APInt::getZero(64));
+ IdxBits.push_back(APInt::getZero(64));
+ }
+
+ MVT CVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+ SDValue Mask =
+ DAG.getBitcast(VT, getConstVector(MaskBits, CVT, DAG, dl));
+ SDValue Idx = DAG.getBitcast(VT, getConstVector(IdxBits, CVT, DAG, dl));
+ Mask = DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
+ Idx = DAG.getNode(X86ISD::PSHUFB, dl, VT, Idx, Amt);
+ Mask = DAG.getNode(ISD::AND, dl, VT, Val, Mask);
+ return DAG.getNode(X86ISD::GF2P8MULB, dl, VT, Mask, Idx);
+ };
+
+ if (Opc == ISD::SHL)
+ return GFShiftLeft(R);
+
+ // srl(x,y)
+ // --> bitreverse(shl(bitreverse(x),y))
+ if (Opc == ISD::SRL) {
+ R = DAG.getNode(ISD::BITREVERSE, dl, VT, R);
+ R = GFShiftLeft(R);
+ return DAG.getNode(ISD::BITREVERSE, dl, VT, R);
+ }
+
+ // sra(x,y)
+ // --> sub(xor(srl(x,y), m),m)
+ // --> sub(xor(bitreverse(shl(bitreverse(x),y)), m),m)
+ // where m = srl(signbit, amt) --> bitreverse(shl(lsb, amt))
+ if (Opc == ISD::SRA) {
+ SDValue LSB = DAG.getConstant(APInt::getOneBitSet(8, 0), dl, VT);
+ SDValue M = DAG.getNode(ISD::BITREVERSE, dl, VT, GFShiftLeft(LSB));
+ R = DAG.getNode(ISD::BITREVERSE, dl, VT, R);
+ R = GFShiftLeft(R);
+ R = DAG.getNode(ISD::BITREVERSE, dl, VT, R);
+ R = DAG.getNode(ISD::XOR, dl, VT, R, M);
+ R = DAG.getNode(ISD::SUB, dl, VT, R, M);
+ return R;
+ }
+ }
+ }
+
// Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
// extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
@@ -55614,6 +55670,15 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
ConcatSubOperand(VT, Ops, 0));
}
break;
+ case X86ISD::GF2P8MULB:
+ if (!IsSplat &&
+ (VT.is256BitVector() ||
+ (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ ConcatSubOperand(VT, Ops, 0),
+ ConcatSubOperand(VT, Ops, 1));
+ }
+ break;
case X86ISD::GF2P8AFFINEQB:
if (!IsSplat &&
(VT.is256BitVector() ||
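The SRL/SRA paths above create ISD::BITREVERSE nodes; on GFNI targets those lower to the gf2p8affineqb $0 instructions with the 9241421688590303745 (0x8040201008040201) matrix constant that show up throughout the updated checks below. A small sketch of why that matrix is a per-byte bit reversal, assuming the documented GF2P8AFFINEQB pseudocode ordering (result bit i is the parity of matrix byte 7-i ANDed with the source byte); the model is illustrative, not taken from the patch.

```c++
#include <cassert>
#include <cstdint>

// Scalar model of one byte of GF2P8AFFINEQB: result bit I is the parity of
// (matrix byte (7 - I)) AND (source byte), XORed with bit I of the immediate.
// The bit/byte ordering here is an assumption based on the ISA pseudocode.
static uint8_t gf2p8affine_byte(uint64_t MatrixQW, uint8_t Src, uint8_t Imm) {
  uint8_t R = 0;
  for (int I = 0; I != 8; ++I) {
    uint8_t Row = (MatrixQW >> (8 * (7 - I))) & 0xFF;
    uint8_t T = Row & Src, P = 0;
    while (T) {
      P ^= 1;
      T &= T - 1;
    }
    R |= (uint8_t)(P << I);
  }
  return R ^ Imm;
}

int main() {
  // 0x8040201008040201 == 9241421688590303745: the anti-diagonal bit matrix.
  const uint64_t BitRevMatrix = 0x8040201008040201ULL;
  for (unsigned V = 0; V != 256; ++V) {
    uint8_t Rev = 0;
    for (int I = 0; I != 8; ++I)
      Rev |= (uint8_t)(((V >> I) & 1) << (7 - I));
    assert(gf2p8affine_byte(BitRevMatrix, (uint8_t)V, 0) == Rev);
  }
  return 0;
}
```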
diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
index 0c341dc63a9ecc..f22df047d15b7d 100644
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -142,114 +142,71 @@ define <16 x i8> @var_fshl_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nou
define <16 x i8> @var_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nounwind {
; GFNISSE-LABEL: var_fshr_v16i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa %xmm2, %xmm3
-; GFNISSE-NEXT: movdqa %xmm0, %xmm2
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; GFNISSE-NEXT: movdqa %xmm3, %xmm0
-; GFNISSE-NEXT: pand %xmm5, %xmm0
-; GFNISSE-NEXT: psllw $5, %xmm0
-; GFNISSE-NEXT: movdqa %xmm0, %xmm4
-; GFNISSE-NEXT: paddb %xmm0, %xmm4
-; GFNISSE-NEXT: movdqa %xmm1, %xmm6
-; GFNISSE-NEXT: psrlw $4, %xmm6
-; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm6
-; GFNISSE-NEXT: psrlw $2, %xmm6
-; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm6
-; GFNISSE-NEXT: psrlw $1, %xmm6
-; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; GFNISSE-NEXT: paddb %xmm4, %xmm4
-; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1
-; GFNISSE-NEXT: pandn %xmm5, %xmm3
-; GFNISSE-NEXT: psllw $5, %xmm3
-; GFNISSE-NEXT: movdqa %xmm3, %xmm4
-; GFNISSE-NEXT: paddb %xmm3, %xmm4
-; GFNISSE-NEXT: paddb %xmm2, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm5
-; GFNISSE-NEXT: psllw $4, %xmm5
-; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
-; GFNISSE-NEXT: movdqa %xmm3, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm3
-; GFNISSE-NEXT: psllw $2, %xmm3
-; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm3
-; GFNISSE-NEXT: paddb %xmm2, %xmm3
-; GFNISSE-NEXT: paddb %xmm4, %xmm4
-; GFNISSE-NEXT: movdqa %xmm4, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; GFNISSE-NEXT: por %xmm1, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNISSE-NEXT: movdqa %xmm2, %xmm4
+; GFNISSE-NEXT: pandn %xmm3, %xmm4
+; GFNISSE-NEXT: movq {{.*#+}} xmm5 = [9241421688590303745,0]
+; GFNISSE-NEXT: movdqa %xmm5, %xmm6
+; GFNISSE-NEXT: pshufb %xmm4, %xmm6
+; GFNISSE-NEXT: movq {{.*#+}} xmm7 = [72909780498219007,0]
+; GFNISSE-NEXT: movdqa %xmm7, %xmm8
+; GFNISSE-NEXT: pshufb %xmm4, %xmm8
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: pand %xmm8, %xmm0
+; GFNISSE-NEXT: gf2p8mulb %xmm6, %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1
+; GFNISSE-NEXT: pand %xmm3, %xmm2
+; GFNISSE-NEXT: pshufb %xmm2, %xmm7
+; GFNISSE-NEXT: pand %xmm1, %xmm7
+; GFNISSE-NEXT: pshufb %xmm2, %xmm5
+; GFNISSE-NEXT: gf2p8mulb %xmm7, %xmm5
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm5
+; GFNISSE-NEXT: por %xmm5, %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: var_fshr_v16i8:
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; GFNIAVX1-NEXT: vpsllw $5, %xmm4, %xmm4
-; GFNIAVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5
-; GFNIAVX1-NEXT: vpsrlw $4, %xmm1, %xmm6
-; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
-; GFNIAVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
-; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
-; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
-; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3
+; GFNIAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; GFNIAVX1-NEXT: vmovq {{.*#+}} xmm5 = [9241421688590303745,0]
+; GFNIAVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm6
+; GFNIAVX1-NEXT: vmovq {{.*#+}} xmm7 = [72909780498219007,0]
+; GFNIAVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm4
; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vpsllw $4, %xmm0, %xmm4
-; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vpsllw $2, %xmm0, %xmm2
-; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
-; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; GFNIAVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
-; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vgf2p8mulb %xmm6, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
+; GFNIAVX1-NEXT: # xmm4 = mem[0,0]
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm3
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; GFNIAVX1-NEXT: vgf2p8mulb %xmm2, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; GFNIAVX1-NEXT: retq
;
; GFNIAVX2-LABEL: var_fshr_v16i8:
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; GFNIAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; GFNIAVX2-NEXT: vpsllw $5, %xmm4, %xmm4
-; GFNIAVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm5
-; GFNIAVX2-NEXT: vpsrlw $4, %xmm1, %xmm6
-; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
-; GFNIAVX2-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
-; GFNIAVX2-NEXT: vpsrlw $2, %xmm1, %xmm4
-; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; GFNIAVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; GFNIAVX2-NEXT: vpsrlw $1, %xmm1, %xmm4
-; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; GFNIAVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5
-; GFNIAVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
-; GFNIAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; GFNIAVX2-NEXT: vpsllw $5, %xmm2, %xmm2
-; GFNIAVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm3
+; GFNIAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; GFNIAVX2-NEXT: vmovq {{.*#+}} xmm5 = [9241421688590303745,0]
+; GFNIAVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm6
+; GFNIAVX2-NEXT: vmovq {{.*#+}} xmm7 = [72909780498219007,0]
+; GFNIAVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm4
; GFNIAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; GFNIAVX2-NEXT: vpsllw $4, %xmm0, %xmm4
-; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; GFNIAVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
-; GFNIAVX2-NEXT: vpsllw $2, %xmm0, %xmm2
-; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; GFNIAVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
-; GFNIAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; GFNIAVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3
-; GFNIAVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vgf2p8mulb %xmm6, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
+; GFNIAVX2-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
+; GFNIAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; GFNIAVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm3
+; GFNIAVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
+; GFNIAVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; GFNIAVX2-NEXT: vgf2p8mulb %xmm2, %xmm1, %xmm1
+; GFNIAVX2-NEXT: vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
; GFNIAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; GFNIAVX2-NEXT: retq
;
@@ -719,34 +676,25 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
; GFNIAVX512VL-LABEL: var_fshl_v32i8:
; GFNIAVX512VL: # %bb.0:
; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; GFNIAVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
-; GFNIAVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
+; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [9241421688590303745,0,9241421688590303745,0]
+; GFNIAVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
+; GFNIAVX512VL-NEXT: vpshufb %ymm4, %ymm5, %ymm6
+; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [72909780498219007,0,72909780498219007,0]
+; GFNIAVX512VL-NEXT: # ymm7 = mem[0,1,0,1]
+; GFNIAVX512VL-NEXT: vpshufb %ymm4, %ymm7, %ymm4
+; GFNIAVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vgf2p8mulb %ymm6, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpshufb %ymm2, %ymm7, %ymm3
; GFNIAVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNIAVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6
-; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm6
-; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6
-; GFNIAVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
-; GFNIAVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6
-; GFNIAVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5
-; GFNIAVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
-; GFNIAVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
-; GFNIAVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4
-; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
-; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; GFNIAVX512VL-NEXT: vgf2p8mulb %ymm2, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm1, %ymm1
; GFNIAVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: retq
;
@@ -769,219 +717,133 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou
define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nounwind {
; GFNISSE-LABEL: var_fshr_v32i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa %xmm4, %xmm6
-; GFNISSE-NEXT: movdqa %xmm0, %xmm4
-; GFNISSE-NEXT: movdqa %xmm2, %xmm9
-; GFNISSE-NEXT: psrlw $4, %xmm9
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; GFNISSE-NEXT: pand %xmm8, %xmm9
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; GFNISSE-NEXT: movdqa %xmm6, %xmm0
-; GFNISSE-NEXT: pand %xmm7, %xmm0
-; GFNISSE-NEXT: psllw $5, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm9, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm10
-; GFNISSE-NEXT: psrlw $2, %xmm10
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; GFNISSE-NEXT: pand %xmm9, %xmm10
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm10, %xmm2
-; GFNISSE-NEXT: movdqa %xmm2, %xmm11
-; GFNISSE-NEXT: psrlw $1, %xmm11
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; GFNISSE-NEXT: pand %xmm10, %xmm11
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm11, %xmm2
-; GFNISSE-NEXT: paddb %xmm4, %xmm4
-; GFNISSE-NEXT: movdqa %xmm4, %xmm12
-; GFNISSE-NEXT: psllw $4, %xmm12
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; GFNISSE-NEXT: pand %xmm11, %xmm12
-; GFNISSE-NEXT: pandn %xmm7, %xmm6
-; GFNISSE-NEXT: psllw $5, %xmm6
-; GFNISSE-NEXT: movdqa %xmm6, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm12, %xmm4
-; GFNISSE-NEXT: movdqa %xmm4, %xmm13
-; GFNISSE-NEXT: psllw $2, %xmm13
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm12 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; GFNISSE-NEXT: pand %xmm12, %xmm13
-; GFNISSE-NEXT: paddb %xmm6, %xmm6
-; GFNISSE-NEXT: movdqa %xmm6, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm4
-; GFNISSE-NEXT: movdqa %xmm4, %xmm13
-; GFNISSE-NEXT: paddb %xmm4, %xmm13
-; GFNISSE-NEXT: paddb %xmm6, %xmm6
-; GFNISSE-NEXT: movdqa %xmm6, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm4
-; GFNISSE-NEXT: por %xmm2, %xmm4
-; GFNISSE-NEXT: movdqa %xmm3, %xmm2
-; GFNISSE-NEXT: psrlw $4, %xmm2
-; GFNISSE-NEXT: pand %xmm8, %xmm2
-; GFNISSE-NEXT: movdqa %xmm5, %xmm0
-; GFNISSE-NEXT: pand %xmm7, %xmm0
-; GFNISSE-NEXT: psllw $5, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm3
-; GFNISSE-NEXT: movdqa %xmm3, %xmm2
-; GFNISSE-NEXT: psrlw $2, %xmm2
-; GFNISSE-NEXT: pand %xmm9, %xmm2
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm3
-; GFNISSE-NEXT: movdqa %xmm3, %xmm2
-; GFNISSE-NEXT: psrlw $1, %xmm2
-; GFNISSE-NEXT: pand %xmm10, %xmm2
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNISSE-NEXT: movdqa %xmm4, %xmm9
+; GFNISSE-NEXT: pandn %xmm8, %xmm9
+; GFNISSE-NEXT: movq {{.*#+}} xmm6 = [9241421688590303745,0]
+; GFNISSE-NEXT: movdqa %xmm6, %xmm10
+; GFNISSE-NEXT: pshufb %xmm9, %xmm10
+; GFNISSE-NEXT: movq {{.*#+}} xmm7 = [72909780498219007,0]
+; GFNISSE-NEXT: movdqa %xmm7, %xmm11
+; GFNISSE-NEXT: pshufb %xmm9, %xmm11
; GFNISSE-NEXT: paddb %xmm0, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm3
+; GFNISSE-NEXT: pand %xmm11, %xmm0
+; GFNISSE-NEXT: gf2p8mulb %xmm10, %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [9241421688590303745,9241421688590303745]
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm2
+; GFNISSE-NEXT: pand %xmm8, %xmm4
+; GFNISSE-NEXT: movdqa %xmm7, %xmm10
+; GFNISSE-NEXT: pshufb %xmm4, %xmm10
+; GFNISSE-NEXT: pand %xmm2, %xmm10
+; GFNISSE-NEXT: movdqa %xmm6, %xmm2
+; GFNISSE-NEXT: pshufb %xmm4, %xmm2
+; GFNISSE-NEXT: gf2p8mulb %xmm10, %xmm2
+; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm2
+; GFNISSE-NEXT: por %xmm2, %xmm0
+; GFNISSE-NEXT: movdqa %xmm5, %xmm2
+; GFNISSE-NEXT: pandn %xmm8, %xmm2
+; GFNISSE-NEXT: movdqa %xmm6, %xmm4
+; GFNISSE-NEXT: pshufb %xmm2, %xmm4
+; GFNISSE-NEXT: movdqa %xmm7, %xmm10
+; GFNISSE-NEXT: pshufb %xmm2, %xmm10
; GFNISSE-NEXT: paddb %xmm1, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm2
-; GFNISSE-NEXT: psllw $4, %xmm2
-; GFNISSE-NEXT: pand %xmm11, %xmm2
-; GFNISSE-NEXT: pandn %xmm7, %xmm5
-; GFNISSE-NEXT: psllw $5, %xmm5
-; GFNISSE-NEXT: movdqa %xmm5, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm2
-; GFNISSE-NEXT: psllw $2, %xmm2
-; GFNISSE-NEXT: pand %xmm12, %xmm2
-; GFNISSE-NEXT: paddb %xmm5, %xmm5
-; GFNISSE-NEXT: movdqa %xmm5, %xmm0
-; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; GFNISSE-NEXT: movdqa %xmm1, %xmm2
-; GFNISSE-NEXT: paddb %xmm1, %xmm2
-; GFNISSE-NEXT: paddb %xmm5, %xmm5
-; GFNISSE-NEXT: movdqa %xmm5, %xmm0
-; GFNISSE-NEXT: pb...
[truncated]
You can test this locally with the following command:

git-clang-format --diff 754ff0f54a4b09a8e4b00783475c51f66b949b66 c9038213eed9d80554f384c42f380e5dee0a8ae7 -- llvm/lib/Target/X86/X86ISelLowering.cpp

View the diff from clang-format here.

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c2ccd9eb03..e7709d03ac 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -55864,9 +55864,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
break;
case X86ISD::GF2P8MULB:
- if (!IsSplat &&
- (VT.is256BitVector() ||
- (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
+ if (!IsSplat && (VT.is256BitVector() ||
+ (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
return DAG.getNode(Op0.getOpcode(), DL, VT,
ConcatSubOperand(VT, Ops, 0),
ConcatSubOperand(VT, Ops, 1));