Skip to content

Commit b6a2207

Browse files
committed
[X86] Move bitselect matching to vpternlog into X86ISelDAGToDAG.cpp
This allows us to reduce the use count on the condition node before the match, which enables load folding for that operand without relying on the peephole pass. Broadcast load folding will be improved upon in a subsequent commit. This still requires a number of isel patterns for the vXi16/vXi8 types, though. llvm-svn: 373156
1 parent 0ac4aac commit b6a2207

17 files changed

+286
-169
lines changed

llvm/lib/Target/X86/X86ISelDAGToDAG.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,7 @@ namespace {
510510
bool combineIncDecVector(SDNode *Node);
511511
bool tryShrinkShlLogicImm(SDNode *N);
512512
bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
513+
bool tryMatchBitSelect(SDNode *N);
513514

514515
MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
515516
const SDLoc &dl, MVT VT, SDNode *Node);
@@ -4275,6 +4276,55 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
42754276
return true;
42764277
}
42774278

4279+
// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
4280+
// into vpternlog.
//
// N must be an ISD::OR node (asserted below). Returns true if N was replaced
// by an X86ISD::VPTERNLOG node and that node was handed to SelectCode; returns
// false, before creating or replacing any node, if the pattern or the
// subtarget requirements do not match.
4281+
bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
4282+
assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
4283+
4284+
MVT NVT = N->getSimpleValueType(0);
4285+
4286+
// Make sure we support VPTERNLOG.
4287+
if (!NVT.isVector() || !Subtarget->hasAVX512())
4288+
return false;
4289+
4290+
// We need VLX for 128/256-bit.
4291+
if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4292+
return false;
4293+
4294+
SDValue N0 = N->getOperand(0);
4295+
SDValue N1 = N->getOperand(1);
4296+
4297+
// Canonicalize AND to LHS.
4298+
if (N1.getOpcode() == ISD::AND)
4299+
std::swap(N0, N1);
4300+
4301+
// After canonicalization N0 must be the AND and N1 the ANDNP, and each must
// have exactly one use.
// NOTE(review): the single-use requirement presumably ensures both nodes
// become dead once the OR is replaced -- confirm.
if (N0.getOpcode() != ISD::AND ||
4302+
N1.getOpcode() != X86ISD::ANDNP ||
4303+
!N0.hasOneUse() || !N1.hasOneUse())
4304+
return false;
4305+
4306+
// ANDN is not commutable, use it to pin down A and C.
4307+
SDValue A = N1.getOperand(0);
4308+
SDValue C = N1.getOperand(1);
4309+
4310+
// AND is commutable, if one operand matches A, the other operand is B.
4311+
// Otherwise this isn't a match.
4312+
SDValue B;
4313+
if (N0.getOperand(0) == A)
4314+
B = N0.getOperand(1);
4315+
else if (N0.getOperand(1) == A)
4316+
B = N0.getOperand(0);
4317+
else
4318+
return false;
4319+
4320+
SDLoc dl(N);
4321+
// 0xCA (decimal 202, binary 11001010) is the VPTERNLOG truth table for
// (A & B) | (~A & C) with A, B, C as the first, second and third sources.
SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
4322+
SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
4323+
// Replace N before running the generated matcher on the new node. Per the
// commit description, this lowers the use count on the condition operand
// before the match, which enables load folding for that operand.
ReplaceNode(N, Ternlog.getNode());
4324+
SelectCode(Ternlog.getNode());
4325+
return true;
4326+
}
4327+
42784328
void X86DAGToDAGISel::Select(SDNode *Node) {
42794329
MVT NVT = Node->getSimpleValueType(0);
42804330
unsigned Opcode = Node->getOpcode();
@@ -4433,6 +4483,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
44334483
if (tryShrinkShlLogicImm(Node))
44344484
return;
44354485

4486+
if (Opcode == ISD::OR && tryMatchBitSelect(Node))
4487+
return;
4488+
44364489
LLVM_FALLTHROUGH;
44374490
case ISD::ADD:
44384491
case ISD::SUB: {

llvm/lib/Target/X86/X86InstrAVX512.td

Lines changed: 107 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -11436,6 +11436,113 @@ defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", SchedWriteVecALU,
1143611436
defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU,
1143711437
avx512vl_i64_info>, VEX_W;
1143811438

11439+
// Patterns to use VPTERNLOG for vXi16/vXi8 vectors.
//
// These 128-bit and 256-bit patterns require VLX. Every type is mapped onto
// the Q (64-bit element) form of the instruction. Each type gets four
// patterns:
//   1. All three sources in registers (rri).
//   2. Third source loaded from memory: folded as-is (rmi).
//   3. First source loaded from memory: the first and third sources are
//      swapped so the load becomes the memory operand, and the immediate is
//      rewritten with VPTERNLOG321_imm8.
//   4. Second source loaded from memory: the second and third sources are
//      swapped so the load becomes the memory operand, and the immediate is
//      rewritten with VPTERNLOG132_imm8.
// NOTE(review): VPTERNLOG321_imm8 / VPTERNLOG132_imm8 are defined outside this
// hunk; presumably they permute the truth table to match the new operand
// order -- confirm against their definitions.
11440+
let Predicates = [HasVLX] in {
11441+
def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3,
11442+
(i8 timm:$src4))),
11443+
(VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3,
11444+
timm:$src4)>;
11445+
def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2,
11446+
(loadv16i8 addr:$src3), (i8 timm:$src4))),
11447+
(VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
11448+
timm:$src4)>;
11449+
def : Pat<(v16i8 (X86vpternlog (loadv16i8 addr:$src3), VR128X:$src2,
11450+
VR128X:$src1, (i8 timm:$src4))),
11451+
(VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
11452+
(VPTERNLOG321_imm8 timm:$src4))>;
11453+
def : Pat<(v16i8 (X86vpternlog VR128X:$src1, (loadv16i8 addr:$src3),
11454+
VR128X:$src2, (i8 timm:$src4))),
11455+
(VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
11456+
(VPTERNLOG132_imm8 timm:$src4))>;
11457+
11458+
def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3,
11459+
(i8 timm:$src4))),
11460+
(VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3,
11461+
timm:$src4)>;
11462+
def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2,
11463+
(loadv8i16 addr:$src3), (i8 timm:$src4))),
11464+
(VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
11465+
timm:$src4)>;
11466+
def : Pat<(v8i16 (X86vpternlog (loadv8i16 addr:$src3), VR128X:$src2,
11467+
VR128X:$src1, (i8 timm:$src4))),
11468+
(VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
11469+
(VPTERNLOG321_imm8 timm:$src4))>;
11470+
def : Pat<(v8i16 (X86vpternlog VR128X:$src1, (loadv8i16 addr:$src3),
11471+
VR128X:$src2, (i8 timm:$src4))),
11472+
(VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
11473+
(VPTERNLOG132_imm8 timm:$src4))>;
11474+
11475+
def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3,
11476+
(i8 timm:$src4))),
11477+
(VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3,
11478+
timm:$src4)>;
11479+
def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2,
11480+
(loadv32i8 addr:$src3), (i8 timm:$src4))),
11481+
(VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
11482+
timm:$src4)>;
11483+
def : Pat<(v32i8 (X86vpternlog (loadv32i8 addr:$src3), VR256X:$src2,
11484+
VR256X:$src1, (i8 timm:$src4))),
11485+
(VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
11486+
(VPTERNLOG321_imm8 timm:$src4))>;
11487+
def : Pat<(v32i8 (X86vpternlog VR256X:$src1, (loadv32i8 addr:$src3),
11488+
VR256X:$src2, (i8 timm:$src4))),
11489+
(VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
11490+
(VPTERNLOG132_imm8 timm:$src4))>;
11491+
11492+
def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3,
11493+
(i8 timm:$src4))),
11494+
(VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3,
11495+
timm:$src4)>;
11496+
def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2,
11497+
(loadv16i16 addr:$src3), (i8 timm:$src4))),
11498+
(VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
11499+
timm:$src4)>;
11500+
def : Pat<(v16i16 (X86vpternlog (loadv16i16 addr:$src3), VR256X:$src2,
11501+
VR256X:$src1, (i8 timm:$src4))),
11502+
(VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
11503+
(VPTERNLOG321_imm8 timm:$src4))>;
11504+
def : Pat<(v16i16 (X86vpternlog VR256X:$src1, (loadv16i16 addr:$src3),
11505+
VR256X:$src2, (i8 timm:$src4))),
11506+
(VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
11507+
(VPTERNLOG132_imm8 timm:$src4))>;
11508+
}
11509+
11510+
// 512-bit VPTERNLOG patterns for v64i8 and v32i16; these only require AVX512.
// Both types are mapped onto the Q (64-bit element) instruction
// VPTERNLOGQZrri / VPTERNLOGQZrmi. Each type gets four patterns: all sources
// in registers; a load in the third source, folded as-is; a load in the first
// source (first and third sources swapped, immediate rewritten with
// VPTERNLOG321_imm8); and a load in the second source (second and third
// sources swapped, immediate rewritten with VPTERNLOG132_imm8).
// NOTE(review): the *_imm8 transforms are defined outside this hunk;
// presumably they permute the truth table to match the new operand order --
// confirm against their definitions.
let Predicates = [HasAVX512] in {
11511+
def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2, VR512:$src3,
11512+
(i8 timm:$src4))),
11513+
(VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3,
11514+
timm:$src4)>;
11515+
def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2,
11516+
(loadv64i8 addr:$src3), (i8 timm:$src4))),
11517+
(VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
11518+
timm:$src4)>;
11519+
def : Pat<(v64i8 (X86vpternlog (loadv64i8 addr:$src3), VR512:$src2,
11520+
VR512:$src1, (i8 timm:$src4))),
11521+
(VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
11522+
(VPTERNLOG321_imm8 timm:$src4))>;
11523+
def : Pat<(v64i8 (X86vpternlog VR512:$src1, (loadv64i8 addr:$src3),
11524+
VR512:$src2, (i8 timm:$src4))),
11525+
(VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
11526+
(VPTERNLOG132_imm8 timm:$src4))>;
11527+
11528+
def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2, VR512:$src3,
11529+
(i8 timm:$src4))),
11530+
(VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3,
11531+
timm:$src4)>;
11532+
def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2,
11533+
(loadv32i16 addr:$src3), (i8 timm:$src4))),
11534+
(VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
11535+
timm:$src4)>;
11536+
def : Pat<(v32i16 (X86vpternlog (loadv32i16 addr:$src3), VR512:$src2,
11537+
VR512:$src1, (i8 timm:$src4))),
11538+
(VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
11539+
(VPTERNLOG321_imm8 timm:$src4))>;
11540+
def : Pat<(v32i16 (X86vpternlog VR512:$src1, (loadv32i16 addr:$src3),
11541+
VR512:$src2, (i8 timm:$src4))),
11542+
(VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
11543+
(VPTERNLOG132_imm8 timm:$src4))>;
11544+
}
11545+
1143911546
// Patterns to implement vnot using vpternlog instead of creating all ones
1144011547
// using pcmpeq or vpternlog and then xoring with that. The value 15 is chosen
1144111548
// so that the result is only dependent on src0. But we use the same source
@@ -11533,49 +11640,6 @@ let Predicates = [HasVLX] in {
1153311640
(VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
1153411641
}
1153511642

11536-
let Predicates = [HasVLX] in {
11537-
def : Pat<(v16i8 (or (and VR128X:$src1, VR128X:$src2),
11538-
(X86andnp VR128X:$src1, VR128X:$src3))),
11539-
(VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3, (i8 202))>;
11540-
def : Pat<(v8i16 (or (and VR128X:$src1, VR128X:$src2),
11541-
(X86andnp VR128X:$src1, VR128X:$src3))),
11542-
(VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3, (i8 202))>;
11543-
def : Pat<(v4i32 (or (and VR128X:$src1, VR128X:$src2),
11544-
(X86andnp VR128X:$src1, VR128X:$src3))),
11545-
(VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3, (i8 202))>;
11546-
def : Pat<(v2i64 (or (and VR128X:$src1, VR128X:$src2),
11547-
(X86andnp VR128X:$src1, VR128X:$src3))),
11548-
(VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3, (i8 202))>;
11549-
11550-
def : Pat<(v32i8 (or (and VR256X:$src1, VR256X:$src2),
11551-
(X86andnp VR256X:$src1, VR256X:$src3))),
11552-
(VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3, (i8 202))>;
11553-
def : Pat<(v16i16 (or (and VR256X:$src1, VR256X:$src2),
11554-
(X86andnp VR256X:$src1, VR256X:$src3))),
11555-
(VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3, (i8 202))>;
11556-
def : Pat<(v8i32 (or (and VR256X:$src1, VR256X:$src2),
11557-
(X86andnp VR256X:$src1, VR256X:$src3))),
11558-
(VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3, (i8 202))>;
11559-
def : Pat<(v4i64 (or (and VR256X:$src1, VR256X:$src2),
11560-
(X86andnp VR256X:$src1, VR256X:$src3))),
11561-
(VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3, (i8 202))>;
11562-
}
11563-
11564-
let Predicates = [HasAVX512] in {
11565-
def : Pat<(v64i8 (or (and VR512:$src1, VR512:$src2),
11566-
(X86andnp VR512:$src1, VR512:$src3))),
11567-
(VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3, (i8 202))>;
11568-
def : Pat<(v32i16 (or (and VR512:$src1, VR512:$src2),
11569-
(X86andnp VR512:$src1, VR512:$src3))),
11570-
(VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3, (i8 202))>;
11571-
def : Pat<(v16i32 (or (and VR512:$src1, VR512:$src2),
11572-
(X86andnp VR512:$src1, VR512:$src3))),
11573-
(VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3, (i8 202))>;
11574-
def : Pat<(v8i64 (or (and VR512:$src1, VR512:$src2),
11575-
(X86andnp VR512:$src1, VR512:$src3))),
11576-
(VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3, (i8 202))>;
11577-
}
11578-
1157911643
//===----------------------------------------------------------------------===//
1158011644
// AVX-512 - FixupImm
1158111645
//===----------------------------------------------------------------------===//

llvm/test/CodeGen/X86/combine-bitselect.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -548,9 +548,9 @@ define <8 x i64> @bitselect_v8i64_mm(<8 x i64>* nocapture readonly, <8 x i64>* n
548548
;
549549
; AVX512F-LABEL: bitselect_v8i64_mm:
550550
; AVX512F: # %bb.0:
551-
; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1
552-
; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0
553-
; AVX512F-NEXT: vpternlogq $228, {{.*}}(%rip), %zmm1, %zmm0
551+
; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1
552+
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
553+
; AVX512F-NEXT: vpternlogq $202, (%rdi), %zmm1, %zmm0
554554
; AVX512F-NEXT: retq
555555
%3 = load <8 x i64>, <8 x i64>* %0
556556
%4 = load <8 x i64>, <8 x i64>* %1

llvm/test/CodeGen/X86/vec-copysign-avx512.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ define <4 x float> @v4f32(<4 x float> %a, <4 x float> %b) nounwind {
66
; CHECK-LABEL: v4f32:
77
; CHECK: ## %bb.0:
88
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
9-
; CHECK-NEXT: vpternlogq $226, %xmm1, %xmm2, %xmm0
9+
; CHECK-NEXT: vpternlogd $226, %xmm1, %xmm2, %xmm0
1010
; CHECK-NEXT: retq
1111
%tmp = tail call <4 x float> @llvm.copysign.v4f32( <4 x float> %a, <4 x float> %b )
1212
ret <4 x float> %tmp
@@ -16,7 +16,7 @@ define <8 x float> @v8f32(<8 x float> %a, <8 x float> %b) nounwind {
1616
; CHECK-LABEL: v8f32:
1717
; CHECK: ## %bb.0:
1818
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
19-
; CHECK-NEXT: vpternlogq $226, %ymm1, %ymm2, %ymm0
19+
; CHECK-NEXT: vpternlogd $226, %ymm1, %ymm2, %ymm0
2020
; CHECK-NEXT: retq
2121
%tmp = tail call <8 x float> @llvm.copysign.v8f32( <8 x float> %a, <8 x float> %b )
2222
ret <8 x float> %tmp
@@ -26,7 +26,7 @@ define <16 x float> @v16f32(<16 x float> %a, <16 x float> %b) nounwind {
2626
; CHECK-LABEL: v16f32:
2727
; CHECK: ## %bb.0:
2828
; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
29-
; CHECK-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
29+
; CHECK-NEXT: vpternlogd $226, %zmm1, %zmm2, %zmm0
3030
; CHECK-NEXT: retq
3131
%tmp = tail call <16 x float> @llvm.copysign.v16f32( <16 x float> %a, <16 x float> %b )
3232
ret <16 x float> %tmp

llvm/test/CodeGen/X86/vector-fshl-128.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2991,9 +2991,9 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
29912991
;
29922992
; AVX512VL-LABEL: splatconstant_funnnel_v16i8:
29932993
; AVX512VL: # %bb.0:
2994-
; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm1
2995-
; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm0
2996-
; AVX512VL-NEXT: vpternlogq $228, {{.*}}(%rip), %xmm1, %xmm0
2994+
; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2
2995+
; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm0
2996+
; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm2, %xmm0
29972997
; AVX512VL-NEXT: retq
29982998
;
29992999
; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
@@ -3016,16 +3016,16 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
30163016
;
30173017
; AVX512VLBW-LABEL: splatconstant_funnnel_v16i8:
30183018
; AVX512VLBW: # %bb.0:
3019-
; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm1
3020-
; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm0
3021-
; AVX512VLBW-NEXT: vpternlogq $228, {{.*}}(%rip), %xmm1, %xmm0
3019+
; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm2
3020+
; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm0
3021+
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm2, %xmm0
30223022
; AVX512VLBW-NEXT: retq
30233023
;
30243024
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
30253025
; AVX512VLVBMI2: # %bb.0:
3026-
; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm1
3027-
; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm0
3028-
; AVX512VLVBMI2-NEXT: vpternlogq $228, {{.*}}(%rip), %xmm1, %xmm0
3026+
; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
3027+
; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
3028+
; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm2, %xmm0
30293029
; AVX512VLVBMI2-NEXT: retq
30303030
;
30313031
; XOP-LABEL: splatconstant_funnnel_v16i8:

llvm/test/CodeGen/X86/vector-fshl-256.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2514,9 +2514,9 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
25142514
;
25152515
; AVX512VL-LABEL: splatconstant_funnnel_v32i8:
25162516
; AVX512VL: # %bb.0:
2517-
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
2518-
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
2519-
; AVX512VL-NEXT: vpternlogq $228, {{.*}}(%rip), %ymm1, %ymm0
2517+
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
2518+
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm0
2519+
; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm0
25202520
; AVX512VL-NEXT: retq
25212521
;
25222522
; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
@@ -2539,16 +2539,16 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
25392539
;
25402540
; AVX512VLBW-LABEL: splatconstant_funnnel_v32i8:
25412541
; AVX512VLBW: # %bb.0:
2542-
; AVX512VLBW-NEXT: vpsrlw $4, %ymm1, %ymm1
2543-
; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm0
2544-
; AVX512VLBW-NEXT: vpternlogq $228, {{.*}}(%rip), %ymm1, %ymm0
2542+
; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm2
2543+
; AVX512VLBW-NEXT: vpsrlw $4, %ymm1, %ymm0
2544+
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm0
25452545
; AVX512VLBW-NEXT: retq
25462546
;
25472547
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i8:
25482548
; AVX512VLVBMI2: # %bb.0:
2549-
; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm1, %ymm1
2550-
; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm0
2551-
; AVX512VLVBMI2-NEXT: vpternlogq $228, {{.*}}(%rip), %ymm1, %ymm0
2549+
; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm2
2550+
; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm1, %ymm0
2551+
; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm0
25522552
; AVX512VLVBMI2-NEXT: retq
25532553
;
25542554
; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:

llvm/test/CodeGen/X86/vector-fshl-512.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1559,30 +1559,30 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi
15591559
;
15601560
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
15611561
; AVX512BW: # %bb.0:
1562-
; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm1
1563-
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm0
1564-
; AVX512BW-NEXT: vpternlogq $228, {{.*}}(%rip), %zmm1, %zmm0
1562+
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
1563+
; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm0
1564+
; AVX512BW-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm2, %zmm0
15651565
; AVX512BW-NEXT: retq
15661566
;
15671567
; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
15681568
; AVX512VBMI2: # %bb.0:
1569-
; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm1
1570-
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm0
1571-
; AVX512VBMI2-NEXT: vpternlogq $228, {{.*}}(%rip), %zmm1, %zmm0
1569+
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
1570+
; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm0
1571+
; AVX512VBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm2, %zmm0
15721572
; AVX512VBMI2-NEXT: retq
15731573
;
15741574
; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
15751575
; AVX512VLBW: # %bb.0:
1576-
; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm1
1577-
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm0
1578-
; AVX512VLBW-NEXT: vpternlogq $228, {{.*}}(%rip), %zmm1, %zmm0
1576+
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
1577+
; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm0
1578+
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm2, %zmm0
15791579
; AVX512VLBW-NEXT: retq
15801580
;
15811581
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
15821582
; AVX512VLVBMI2: # %bb.0:
1583-
; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm1
1584-
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm0
1585-
; AVX512VLVBMI2-NEXT: vpternlogq $228, {{.*}}(%rip), %zmm1, %zmm0
1583+
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
1584+
; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm0
1585+
; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm2, %zmm0
15861586
; AVX512VLVBMI2-NEXT: retq
15871587
%res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
15881588
ret <64 x i8> %res

0 commit comments

Comments
 (0)