Commit 0f652d8

Author: Simon Pilgrim (committed)
[X86] LowerRotate - recognise hidden ROTR patterns for better vXi8 codegen
Check for a hidden ISD::ROTR (rotl(sub(0,x))) - vXi8 lowering can handle both (it's always beneficial for splats, but otherwise only if we have VPTERNLOG). We currently hit infinite loops in TargetLowering::expandROT if we set ISD::ROTR to custom, which needs addressing before we extend this much further.
1 parent 47eb3f1 commit 0f652d8
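
The identity being exploited: because rotate amounts wrap modulo the bit width, rotl(x, sub(0, y)) is exactly rotr(x, y). A minimal standalone C++ check of that equivalence for 8-bit lanes (illustrative only, independent of the LLVM code below):

    #include <cassert>
    #include <cstdint>

    static uint8_t rotl8(uint8_t X, unsigned Y) {
      Y &= 7; // rotate amounts wrap modulo the bit width
      return (uint8_t)((X << Y) | (X >> ((8 - Y) & 7)));
    }

    static uint8_t rotr8(uint8_t X, unsigned Y) {
      Y &= 7;
      return (uint8_t)((X >> Y) | (X << ((8 - Y) & 7)));
    }

    int main() {
      for (unsigned X = 0; X < 256; ++X)
        for (unsigned Y = 0; Y < 8; ++Y)
          // rotl by a negated amount is a "hidden" rotr.
          assert(rotl8((uint8_t)X, 0u - Y) == rotr8((uint8_t)X, Y));
    }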

File tree

3 files changed: +108 -103 lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 30 additions & 11 deletions
@@ -29854,20 +29854,30 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
   if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
     return SDValue();
 
+  // Check for a hidden ISD::ROTR, vXi8 lowering can handle both, but we
+  // currently hit infinite loops in legalization if we allow ISD::ROTR.
+  // FIXME: Infinite ROTL<->ROTR legalization in TargetLowering::expandROT.
+  SDValue HiddenROTRAmt;
+  if (Amt.getOpcode() == ISD::SUB &&
+      ISD::isBuildVectorAllZeros(Amt.getOperand(0).getNode()))
+    HiddenROTRAmt = Amt.getOperand(1);
+
   MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
 
   // If the amount is a splat, attempt to fold as unpack(x,x) << zext(y):
   // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
-  if (SDValue BaseRotAmt =
-          DAG.getSplatValue(DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask))) {
+  // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
+  if (SDValue BaseRotAmt = DAG.getSplatValue(DAG.getNode(
+          ISD::AND, DL, VT, HiddenROTRAmt ? HiddenROTRAmt : Amt, AmtMask))) {
+    unsigned ShiftX86Opc = HiddenROTRAmt ? X86ISD::VSRLI : X86ISD::VSHLI;
     BaseRotAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseRotAmt);
     SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
     SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
-    Lo = getTargetVShiftNode(X86ISD::VSHLI, DL, ExtVT, Lo, BaseRotAmt,
+    Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
                              Subtarget, DAG);
-    Hi = getTargetVShiftNode(X86ISD::VSHLI, DL, ExtVT, Hi, BaseRotAmt,
+    Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
                              Subtarget, DAG);
-    return getPack(DAG, Subtarget, DL, VT, Lo, Hi, /*PackHiHalf */ true);
+    return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !HiddenROTRAmt);
   }
 
   // We don't need ModuloAmt here as we just peek at individual bits.
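
The splat fold above widens each pair of i8 lanes to i16 via unpack(x,x) and does a single 16-bit shift per half. A scalar C++ model of the two folds named in the comments (a sketch assuming 8-bit lanes; the function names are illustrative):

    #include <cstdint>

    // rotl(x,y) -> (((x << 8) | x) << (y & 7)) >> 8, keeping the low byte.
    static uint8_t RotlViaWiden(uint8_t X, unsigned Y) {
      uint16_t W = (uint16_t)(((uint16_t)X << 8) | X); // unpack(x,x)
      return (uint8_t)((uint16_t)(W << (Y & 7)) >> 8);
    }

    // rotr(x,y) -> (((x << 8) | x) >> (y & 7)); the result already sits in
    // the low half, which is why the hidden-ROTR path in the change above
    // selects VSRLI and passes !HiddenROTRAmt for PackHiHalf.
    static uint8_t RotrViaWiden(uint8_t X, unsigned Y) {
      uint16_t W = (uint16_t)(((uint16_t)X << 8) | X);
      return (uint8_t)(W >> (Y & 7)); // truncation keeps the low byte
    }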
@@ -29889,6 +29899,15 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
     return DAG.getSelect(DL, SelVT, C, V0, V1);
   };
 
+  // 'Hidden' ROTR is currently only profitable on AVX512 targets where we
+  // have VPTERNLOG.
+  unsigned ShiftLHS = ISD::SHL;
+  unsigned ShiftRHS = ISD::SRL;
+  if (HiddenROTRAmt && useVPTERNLOG(Subtarget, VT)) {
+    std::swap(ShiftLHS, ShiftRHS);
+    Amt = HiddenROTRAmt;
+  }
+
   // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
   // We can safely do this using i16 shifts as we're only interested in
   // the 3 lower bits of each byte.
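
The general (non-splat) path below rotates by 4, then 2, then 1, keeping each partial result only when the matching bit of the amount is set; the ShiftLHS/ShiftRHS swap above turns every rotl step into a rotr step. A scalar C++ sketch of the idea (illustrative, not the DAG code):

    #include <cstdint>

    static uint8_t RotByAmtBits(uint8_t R, uint8_t Amt, bool IsRotR) {
      for (unsigned Step : {4u, 2u, 1u}) {
        // M = (R ShiftLHS k) | (R ShiftRHS (8 - k)); swapping the two
        // shift directions turns a rotl step into a rotr step.
        unsigned K = IsRotR ? (8 - Step) : Step;
        uint8_t M = (uint8_t)((R << K) | (R >> (8 - K)));
        if (Amt & Step) // peek at one amount bit per step
          R = M;
      }
      return R;
    }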
@@ -29900,8 +29919,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
   SDValue M;
   M = DAG.getNode(
       ISD::OR, DL, VT,
-      DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
-      DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
+      DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
+      DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
   R = SignBitSelect(VT, Amt, M, R);
 
   // a += a
@@ -29910,8 +29929,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
   // r = VSELECT(r, rot(r, 2), a);
   M = DAG.getNode(
       ISD::OR, DL, VT,
-      DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
-      DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
+      DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
+      DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
   R = SignBitSelect(VT, Amt, M, R);
 
   // a += a
@@ -29920,8 +29939,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
   // return VSELECT(r, rot(r, 1), a);
   M = DAG.getNode(
       ISD::OR, DL, VT,
-      DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
-      DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
+      DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
+      DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
   return SignBitSelect(VT, Amt, M, R);
 }
 

llvm/test/CodeGen/X86/vector-fshr-rot-128.ll

Lines changed: 36 additions & 40 deletions
@@ -1195,47 +1195,44 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
 define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
 ; SSE2-LABEL: splatvar_funnnel_v16i8:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: psubb %xmm1, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-NEXT: psllw %xmm2, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm3, %xmm2
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psllw %xmm2, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: splatvar_funnnel_v16i8:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: psubb %xmm1, %xmm2
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE41-NEXT: psllw %xmm2, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE41-NEXT: psrlw %xmm1, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm3, %xmm2
 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: psllw %xmm2, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: psrlw %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm3, %xmm0
+; SSE41-NEXT: packuswb %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: splatvar_funnnel_v16i8:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpsubb %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT: vpsllw %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2
 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm3, %xmm0, %xmm0
 ; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX-NEXT: retq
 ;
@@ -1349,19 +1346,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ;
 ; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
 ; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pxor %xmm2, %xmm2
-; X86-SSE2-NEXT: psubb %xmm1, %xmm2
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; X86-SSE2-NEXT: psllw %xmm2, %xmm1
-; X86-SSE2-NEXT: psrlw $8, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X86-SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-SSE2-NEXT: psrlw %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X86-SSE2-NEXT: pand %xmm3, %xmm2
 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: psllw %xmm2, %xmm0
-; X86-SSE2-NEXT: psrlw $8, %xmm0
-; X86-SSE2-NEXT: packuswb %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: packuswb %xmm2, %xmm0
 ; X86-SSE2-NEXT: retl
   %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
   %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %splat)
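
These tests call @llvm.fshr with both value operands equal; a funnel shift of (x, x) is a rotate-right, which the rotate lowering previously only reached as rotl(x, 0 - amt). A scalar C++ model of the reduction (a sketch, assuming 8-bit elements):

    #include <cstdint>

    // fshr(Hi, Lo, Y): concatenate Hi:Lo, shift right by Y (mod 8),
    // return the low 8 bits.
    static uint8_t Fshr8(uint8_t Hi, uint8_t Lo, unsigned Y) {
      uint16_t W = (uint16_t)(((uint16_t)Hi << 8) | Lo);
      return (uint8_t)(W >> (Y & 7));
    }
    // Fshr8(x, x, y) == rotr(x, y) for all x and y.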

llvm/test/CodeGen/X86/vector-fshr-rot-256.ll

Lines changed: 42 additions & 52 deletions
@@ -490,43 +490,38 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ;
 ; AVX512F-LABEL: var_funnnel_v32i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpsubb %ymm1, %ymm2, %ymm1
 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512F-NEXT: vpsllw $6, %ymm0, %ymm3
 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3
+; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: var_funnnel_v32i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsubb %ymm1, %ymm2, %ymm1
 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsllw $6, %ymm0, %ymm3
 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
-; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
+; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3
+; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT: retq
@@ -975,70 +970,65 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
 define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ; AVX1-LABEL: splatvar_funnnel_v32i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: splatvar_funnnel_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT: vpsllw %xmm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v32i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v32i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX512VL-NEXT: retq
 ;
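
vpternlogq evaluates an arbitrary three-input boolean function per bit: bit (a<<2 | b<<1 | c) of the immediate gives the result for inputs (a, b, c). Immediate 216 (0xD8) is a two-source bitwise blend under a mask, which lets the masked combine of the two shift halves fold into one instruction; this is why the hidden-ROTR path is gated on useVPTERNLOG. A scalar C++ model (a sketch; the mapping of sources to a/b/c here is an assumption about operand order):

    #include <cstdint>

    static uint64_t Ternlog(uint64_t A, uint64_t B, uint64_t C, uint8_t Imm) {
      uint64_t R = 0;
      for (unsigned I = 0; I < 64; ++I) {
        unsigned Idx = (unsigned)((((A >> I) & 1) << 2) |
                                  (((B >> I) & 1) << 1) | ((C >> I) & 1));
        R |= (uint64_t)((Imm >> Idx) & 1) << I; // look up one truth-table bit
      }
      return R;
    }
    // Ternlog(a, b, c, 0xD8) == (c & b) | (~c & a): bits of b where the
    // mask c is set, bits of a elsewhere.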
