Skip to content

Commit e8d60f1

Browse files
committed
[X86] Disable lowering constant build vectors to broadcasts on AVX512 targets
On AVX512 targets we're better off keeping constant vectors at full width to ensure that they can be load-folded into vector instructions, reducing register pressure. If a vector constant remains as a basic load, X86FixupVectorConstantsPass will still convert it to a broadcast instruction for us. Non-VLX targets are still seeing some regressions because these ops are implicitly widened to 512-bit in isel patterns rather than in the DAG, so I've limited this to just 512-bit vectors for now.
1 parent 2abd71e commit e8d60f1

File tree

98 files changed

+92997
-96569
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

98 files changed

+92997
-96569
lines changed

llvm/lib/Target/X86/X86FixupVectorConstants.cpp

Lines changed: 206 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -407,6 +407,26 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
407407
{X86::VMOVSDZrm, 1, 64, rebuildZeroUpperCst},
408408
{X86::VMOVDDUPZ128rm, 1, 64, rebuildSplatCst}},
409409
128, 1);
410+
case X86::VMOVAPDZ128rmk:
411+
case X86::VMOVUPDZ128rmk:
412+
return FixupConstant({{X86::VMOVSDZrmk, 1, 64, rebuildZeroUpperCst},
413+
{X86::VMOVDDUPZ128rmk, 1, 64, rebuildSplatCst}},
414+
128, 3);
415+
case X86::VMOVAPDZ128rmkz:
416+
case X86::VMOVUPDZ128rmkz:
417+
return FixupConstant({{X86::VMOVSDZrmkz, 1, 64, rebuildZeroUpperCst},
418+
{X86::VMOVDDUPZ128rmkz, 1, 64, rebuildSplatCst}},
419+
128, 2);
420+
case X86::VMOVAPSZ128rmk:
421+
case X86::VMOVUPSZ128rmk:
422+
return FixupConstant({{X86::VMOVSSZrmk, 1, 32, rebuildZeroUpperCst},
423+
{X86::VBROADCASTSSZ128rmk, 1, 32, rebuildSplatCst}},
424+
128, 3);
425+
case X86::VMOVAPSZ128rmkz:
426+
case X86::VMOVUPSZ128rmkz:
427+
return FixupConstant({{X86::VMOVSSZrmkz, 1, 32, rebuildZeroUpperCst},
428+
{X86::VBROADCASTSSZ128rmkz, 1, 32, rebuildSplatCst}},
429+
128, 2);
410430
case X86::VMOVAPDZ256rm:
411431
case X86::VMOVAPSZ256rm:
412432
case X86::VMOVUPDZ256rm:
@@ -416,6 +436,26 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
416436
{X86::VBROADCASTSDZ256rm, 1, 64, rebuildSplatCst},
417437
{X86::VBROADCASTF32X4Z256rm, 1, 128, rebuildSplatCst}},
418438
256, 1);
439+
case X86::VMOVAPDZ256rmk:
440+
case X86::VMOVUPDZ256rmk:
441+
return FixupConstant({{X86::VBROADCASTSDZ256rmk, 1, 64, rebuildSplatCst}},
442+
256, 3);
443+
case X86::VMOVAPDZ256rmkz:
444+
case X86::VMOVUPDZ256rmkz:
445+
return FixupConstant({{X86::VBROADCASTSDZ256rmkz, 1, 64, rebuildSplatCst}},
446+
256, 2);
447+
case X86::VMOVAPSZ256rmk:
448+
case X86::VMOVUPSZ256rmk:
449+
return FixupConstant(
450+
{{X86::VBROADCASTSSZ256rmk, 1, 32, rebuildSplatCst},
451+
{X86::VBROADCASTF32X4Z256rmk, 1, 128, rebuildSplatCst}},
452+
256, 3);
453+
case X86::VMOVAPSZ256rmkz:
454+
case X86::VMOVUPSZ256rmkz:
455+
return FixupConstant(
456+
{{X86::VBROADCASTSSZ256rmkz, 1, 32, rebuildSplatCst},
457+
{X86::VBROADCASTF32X4Z256rmkz, 1, 128, rebuildSplatCst}},
458+
256, 2);
419459
case X86::VMOVAPDZrm:
420460
case X86::VMOVAPSZrm:
421461
case X86::VMOVUPDZrm:
@@ -425,6 +465,26 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
425465
{X86::VBROADCASTF32X4rm, 1, 128, rebuildSplatCst},
426466
{X86::VBROADCASTF64X4rm, 1, 256, rebuildSplatCst}},
427467
512, 1);
468+
case X86::VMOVAPDZrmk:
469+
case X86::VMOVUPDZrmk:
470+
return FixupConstant({{X86::VBROADCASTSDZrmk, 1, 64, rebuildSplatCst},
471+
{X86::VBROADCASTF64X4rmk, 1, 256, rebuildSplatCst}},
472+
512, 3);
473+
case X86::VMOVAPDZrmkz:
474+
case X86::VMOVUPDZrmkz:
475+
return FixupConstant({{X86::VBROADCASTSDZrmkz, 1, 64, rebuildSplatCst},
476+
{X86::VBROADCASTF64X4rmkz, 1, 256, rebuildSplatCst}},
477+
512, 2);
478+
case X86::VMOVAPSZrmk:
479+
case X86::VMOVUPSZrmk:
480+
return FixupConstant({{X86::VBROADCASTSSZrmk, 1, 32, rebuildSplatCst},
481+
{X86::VBROADCASTF32X4rmk, 1, 128, rebuildSplatCst}},
482+
512, 3);
483+
case X86::VMOVAPSZrmkz:
484+
case X86::VMOVUPSZrmkz:
485+
return FixupConstant({{X86::VBROADCASTSSZrmkz, 1, 32, rebuildSplatCst},
486+
{X86::VBROADCASTF32X4rmkz, 1, 128, rebuildSplatCst}},
487+
512, 2);
428488
/* Integer Loads */
429489
case X86::MOVDQArm:
430490
case X86::MOVDQUrm: {
@@ -520,6 +580,42 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
520580
{X86::VPMOVZXDQZ128rm, 2, 32, rebuildZExtCst}};
521581
return FixupConstant(Fixups, 128, 1);
522582
}
583+
case X86::VMOVDQA32Z128rmk:
584+
case X86::VMOVDQU32Z128rmk:
585+
return FixupConstant({{X86::VPBROADCASTDZ128rmk, 1, 32, rebuildSplatCst},
586+
{X86::VPMOVSXBDZ128rmk, 4, 8, rebuildSExtCst},
587+
{X86::VPMOVZXBDZ128rmk, 4, 8, rebuildZExtCst},
588+
{X86::VPMOVSXWDZ128rmk, 4, 16, rebuildSExtCst},
589+
{X86::VPMOVZXWDZ128rmk, 4, 16, rebuildZExtCst}},
590+
128, 3);
591+
case X86::VMOVDQA32Z128rmkz:
592+
case X86::VMOVDQU32Z128rmkz:
593+
return FixupConstant({{X86::VPBROADCASTDZ128rmkz, 1, 32, rebuildSplatCst},
594+
{X86::VPMOVSXBDZ128rmkz, 4, 8, rebuildSExtCst},
595+
{X86::VPMOVZXBDZ128rmkz, 4, 8, rebuildZExtCst},
596+
{X86::VPMOVSXWDZ128rmkz, 4, 16, rebuildSExtCst},
597+
{X86::VPMOVZXWDZ128rmkz, 4, 16, rebuildZExtCst}},
598+
128, 2);
599+
case X86::VMOVDQA64Z128rmk:
600+
case X86::VMOVDQU64Z128rmk:
601+
return FixupConstant({{X86::VPMOVSXBQZ128rmk, 2, 8, rebuildSExtCst},
602+
{X86::VPMOVZXBQZ128rmk, 2, 8, rebuildZExtCst},
603+
{X86::VPMOVSXWQZ128rmk, 2, 16, rebuildSExtCst},
604+
{X86::VPMOVZXWQZ128rmk, 2, 16, rebuildZExtCst},
605+
{X86::VPBROADCASTQZ128rmk, 1, 64, rebuildSplatCst},
606+
{X86::VPMOVSXDQZ128rmk, 2, 32, rebuildSExtCst},
607+
{X86::VPMOVZXDQZ128rmk, 2, 32, rebuildZExtCst}},
608+
128, 3);
609+
case X86::VMOVDQA64Z128rmkz:
610+
case X86::VMOVDQU64Z128rmkz:
611+
return FixupConstant({{X86::VPMOVSXBQZ128rmkz, 2, 8, rebuildSExtCst},
612+
{X86::VPMOVZXBQZ128rmkz, 2, 8, rebuildZExtCst},
613+
{X86::VPMOVSXWQZ128rmkz, 2, 16, rebuildSExtCst},
614+
{X86::VPMOVZXWQZ128rmkz, 2, 16, rebuildZExtCst},
615+
{X86::VPBROADCASTQZ128rmkz, 1, 64, rebuildSplatCst},
616+
{X86::VPMOVSXDQZ128rmkz, 2, 32, rebuildSExtCst},
617+
{X86::VPMOVZXDQZ128rmkz, 2, 32, rebuildZExtCst}},
618+
128, 2);
523619
case X86::VMOVDQA32Z256rm:
524620
case X86::VMOVDQA64Z256rm:
525621
case X86::VMOVDQU32Z256rm:
@@ -544,6 +640,46 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
544640
{X86::VPMOVZXDQZ256rm, 4, 32, rebuildZExtCst}};
545641
return FixupConstant(Fixups, 256, 1);
546642
}
643+
case X86::VMOVDQA32Z256rmk:
644+
case X86::VMOVDQU32Z256rmk:
645+
return FixupConstant(
646+
{{X86::VPBROADCASTDZ256rmk, 1, 32, rebuildSplatCst},
647+
{X86::VPMOVSXBDZ256rmk, 8, 8, rebuildSExtCst},
648+
{X86::VPMOVZXBDZ256rmk, 8, 8, rebuildZExtCst},
649+
{X86::VBROADCASTI32X4Z256rmk, 1, 128, rebuildSplatCst},
650+
{X86::VPMOVSXWDZ256rmk, 8, 16, rebuildSExtCst},
651+
{X86::VPMOVZXWDZ256rmk, 8, 16, rebuildZExtCst}},
652+
256, 3);
653+
case X86::VMOVDQA32Z256rmkz:
654+
case X86::VMOVDQU32Z256rmkz:
655+
return FixupConstant(
656+
{{X86::VPBROADCASTDZ256rmkz, 1, 32, rebuildSplatCst},
657+
{X86::VPMOVSXBDZ256rmkz, 8, 8, rebuildSExtCst},
658+
{X86::VPMOVZXBDZ256rmkz, 8, 8, rebuildZExtCst},
659+
{X86::VBROADCASTI32X4Z256rmkz, 1, 128, rebuildSplatCst},
660+
{X86::VPMOVSXWDZ256rmkz, 8, 16, rebuildSExtCst},
661+
{X86::VPMOVZXWDZ256rmkz, 8, 16, rebuildZExtCst}},
662+
256, 2);
663+
case X86::VMOVDQA64Z256rmk:
664+
case X86::VMOVDQU64Z256rmk:
665+
return FixupConstant({{X86::VPMOVSXBQZ256rmk, 4, 8, rebuildSExtCst},
666+
{X86::VPMOVZXBQZ256rmk, 4, 8, rebuildZExtCst},
667+
{X86::VPBROADCASTQZ256rmk, 1, 64, rebuildSplatCst},
668+
{X86::VPMOVSXWQZ256rmk, 4, 16, rebuildSExtCst},
669+
{X86::VPMOVZXWQZ256rmk, 4, 16, rebuildZExtCst},
670+
{X86::VPMOVSXDQZ256rmk, 4, 32, rebuildSExtCst},
671+
{X86::VPMOVZXDQZ256rmk, 4, 32, rebuildZExtCst}},
672+
256, 3);
673+
case X86::VMOVDQA64Z256rmkz:
674+
case X86::VMOVDQU64Z256rmkz:
675+
return FixupConstant({{X86::VPMOVSXBQZ256rmkz, 4, 8, rebuildSExtCst},
676+
{X86::VPMOVZXBQZ256rmkz, 4, 8, rebuildZExtCst},
677+
{X86::VPBROADCASTQZ256rmkz, 1, 64, rebuildSplatCst},
678+
{X86::VPMOVSXWQZ256rmkz, 4, 16, rebuildSExtCst},
679+
{X86::VPMOVZXWQZ256rmkz, 4, 16, rebuildZExtCst},
680+
{X86::VPMOVSXDQZ256rmkz, 4, 32, rebuildSExtCst},
681+
{X86::VPMOVZXDQZ256rmkz, 4, 32, rebuildZExtCst}},
682+
256, 2);
547683
case X86::VMOVDQA32Zrm:
548684
case X86::VMOVDQA64Zrm:
549685
case X86::VMOVDQU32Zrm:
@@ -569,43 +705,93 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
569705
{X86::VPMOVZXDQZrm, 8, 32, rebuildZExtCst}};
570706
return FixupConstant(Fixups, 512, 1);
571707
}
708+
case X86::VMOVDQA32Zrmk:
709+
case X86::VMOVDQU32Zrmk:
710+
return FixupConstant({{X86::VPBROADCASTDZrmk, 1, 32, rebuildSplatCst},
711+
{X86::VBROADCASTI32X4rmk, 1, 128, rebuildSplatCst},
712+
{X86::VPMOVSXBDZrmk, 16, 8, rebuildSExtCst},
713+
{X86::VPMOVZXBDZrmk, 16, 8, rebuildZExtCst},
714+
{X86::VPMOVSXWDZrmk, 16, 16, rebuildSExtCst},
715+
{X86::VPMOVZXWDZrmk, 16, 16, rebuildZExtCst}},
716+
512, 3);
717+
case X86::VMOVDQA32Zrmkz:
718+
case X86::VMOVDQU32Zrmkz:
719+
return FixupConstant({{X86::VPBROADCASTDZrmkz, 1, 32, rebuildSplatCst},
720+
{X86::VBROADCASTI32X4rmkz, 1, 128, rebuildSplatCst},
721+
{X86::VPMOVSXBDZrmkz, 16, 8, rebuildSExtCst},
722+
{X86::VPMOVZXBDZrmkz, 16, 8, rebuildZExtCst},
723+
{X86::VPMOVSXWDZrmkz, 16, 16, rebuildSExtCst},
724+
{X86::VPMOVZXWDZrmkz, 16, 16, rebuildZExtCst}},
725+
512, 2);
726+
case X86::VMOVDQA64Zrmk:
727+
case X86::VMOVDQU64Zrmk:
728+
return FixupConstant({{X86::VPBROADCASTQZrmk, 1, 64, rebuildSplatCst},
729+
{X86::VPMOVSXBQZrmk, 8, 8, rebuildSExtCst},
730+
{X86::VPMOVZXBQZrmk, 8, 8, rebuildZExtCst},
731+
{X86::VPMOVSXWQZrmk, 8, 16, rebuildSExtCst},
732+
{X86::VPMOVZXWQZrmk, 8, 16, rebuildZExtCst},
733+
{X86::VBROADCASTI64X4rmk, 1, 256, rebuildSplatCst},
734+
{X86::VPMOVSXDQZrmk, 8, 32, rebuildSExtCst},
735+
{X86::VPMOVZXDQZrmk, 8, 32, rebuildZExtCst}},
736+
512, 3);
737+
case X86::VMOVDQA64Zrmkz:
738+
case X86::VMOVDQU64Zrmkz:
739+
return FixupConstant({{X86::VPBROADCASTQZrmkz, 1, 64, rebuildSplatCst},
740+
{X86::VPMOVSXBQZrmkz, 8, 8, rebuildSExtCst},
741+
{X86::VPMOVZXBQZrmkz, 8, 8, rebuildZExtCst},
742+
{X86::VPMOVSXWQZrmkz, 8, 16, rebuildSExtCst},
743+
{X86::VPMOVZXWQZrmkz, 8, 16, rebuildZExtCst},
744+
{X86::VBROADCASTI64X4rmkz, 1, 256, rebuildSplatCst},
745+
{X86::VPMOVSXDQZrmkz, 8, 32, rebuildSExtCst},
746+
{X86::VPMOVZXDQZrmkz, 8, 32, rebuildZExtCst}},
747+
512, 2);
572748
}
573749

574-
auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc32, unsigned OpSrc64) {
575-
unsigned OpBcst32 = 0, OpBcst64 = 0;
576-
unsigned OpNoBcst32 = 0, OpNoBcst64 = 0;
750+
auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc16, unsigned OpSrc32,
751+
unsigned OpSrc64) {
752+
if (OpSrc16) {
753+
if (const X86FoldTableEntry *Mem2Bcst =
754+
llvm::lookupBroadcastFoldTableBySize(OpSrc16, 16)) {
755+
unsigned OpBcst16 = Mem2Bcst->DstOp;
756+
unsigned OpNoBcst16 = Mem2Bcst->Flags & TB_INDEX_MASK;
757+
FixupEntry Fixups[] = {{(int)OpBcst16, 1, 16, rebuildSplatCst}};
758+
// TODO: Add support for RegBitWidth, but currently rebuildSplatCst
759+
// doesn't require it (defaults to Constant::getPrimitiveSizeInBits).
760+
if (FixupConstant(Fixups, 0, OpNoBcst16))
761+
return true;
762+
}
763+
}
577764
if (OpSrc32) {
578765
if (const X86FoldTableEntry *Mem2Bcst =
579766
llvm::lookupBroadcastFoldTableBySize(OpSrc32, 32)) {
580-
OpBcst32 = Mem2Bcst->DstOp;
581-
OpNoBcst32 = Mem2Bcst->Flags & TB_INDEX_MASK;
767+
unsigned OpBcst32 = Mem2Bcst->DstOp;
768+
unsigned OpNoBcst32 = Mem2Bcst->Flags & TB_INDEX_MASK;
769+
FixupEntry Fixups[] = {{(int)OpBcst32, 1, 32, rebuildSplatCst}};
770+
// TODO: Add support for RegBitWidth, but currently rebuildSplatCst
771+
// doesn't require it (defaults to Constant::getPrimitiveSizeInBits).
772+
if (FixupConstant(Fixups, 0, OpNoBcst32))
773+
return true;
582774
}
583775
}
584776
if (OpSrc64) {
585777
if (const X86FoldTableEntry *Mem2Bcst =
586778
llvm::lookupBroadcastFoldTableBySize(OpSrc64, 64)) {
587-
OpBcst64 = Mem2Bcst->DstOp;
588-
OpNoBcst64 = Mem2Bcst->Flags & TB_INDEX_MASK;
779+
unsigned OpBcst64 = Mem2Bcst->DstOp;
780+
unsigned OpNoBcst64 = Mem2Bcst->Flags & TB_INDEX_MASK;
781+
FixupEntry Fixups[] = {{(int)OpBcst64, 1, 64, rebuildSplatCst}};
782+
// TODO: Add support for RegBitWidth, but currently rebuildSplatCst
783+
// doesn't require it (defaults to Constant::getPrimitiveSizeInBits).
784+
if (FixupConstant(Fixups, 0, OpNoBcst64))
785+
return true;
589786
}
590787
}
591-
assert(((OpBcst32 == 0) || (OpBcst64 == 0) || (OpNoBcst32 == OpNoBcst64)) &&
592-
"OperandNo mismatch");
593-
594-
if (OpBcst32 || OpBcst64) {
595-
unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32;
596-
FixupEntry Fixups[] = {{(int)OpBcst32, 32, 32, rebuildSplatCst},
597-
{(int)OpBcst64, 64, 64, rebuildSplatCst}};
598-
// TODO: Add support for RegBitWidth, but currently rebuildSplatCst
599-
// doesn't require it (defaults to Constant::getPrimitiveSizeInBits).
600-
return FixupConstant(Fixups, 0, OpNo);
601-
}
602788
return false;
603789
};
604790

605791
// Attempt to find a AVX512 mapping from a full width memory-fold instruction
606792
// to a broadcast-fold instruction variant.
607793
if ((MI.getDesc().TSFlags & X86II::EncodingMask) == X86II::EVEX)
608-
return ConvertToBroadcastAVX512(Opc, Opc);
794+
return ConvertToBroadcastAVX512(Opc, Opc, Opc);
609795

610796
// Reverse the X86InstrInfo::setExecutionDomainCustom EVEX->VEX logic
611797
// conversion to see if we can convert to a broadcasted (integer) logic op.
@@ -662,7 +848,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
662848
break;
663849
}
664850
if (OpSrc32 || OpSrc64)
665-
return ConvertToBroadcastAVX512(OpSrc32, OpSrc64);
851+
return ConvertToBroadcastAVX512(0, OpSrc32, OpSrc64);
666852
}
667853

668854
return false;

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7227,6 +7227,14 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
72277227
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
72287228
"Unsupported vector type for broadcast.");
72297229

7230+
// On AVX512VL targets we're better off keeping the full width constant load
7231+
// and letting X86FixupVectorConstantsPass handle conversion to
7232+
// broadcast/broadcast-fold.
7233+
// AVX512 targets without AVX512VL can do this only for 512-bit vectors.
7234+
if (Subtarget.hasAVX512() && (Subtarget.hasVLX() || VT.is512BitVector()) &&
7235+
BVOp->isConstant())
7236+
return SDValue();
7237+
72307238
// See if the build vector is a repeating sequence of scalars (inc. splat).
72317239
SDValue Ld;
72327240
BitVector UndefElements;

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 8 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -1480,10 +1480,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
14801480
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
14811481
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
14821482
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1483-
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1484-
; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
1485-
; AVX512F-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm2
1486-
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
1483+
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1484+
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
14871485
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
14881486
; AVX512F-NEXT: vzeroupper
14891487
; AVX512F-NEXT: retq
@@ -1495,10 +1493,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
14951493
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
14961494
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
14971495
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1498-
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1499-
; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
1500-
; AVX512DQ-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm2
1501-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
1496+
; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1497+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
15021498
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
15031499
; AVX512DQ-NEXT: vzeroupper
15041500
; AVX512DQ-NEXT: retq
@@ -3260,10 +3256,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
32603256
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
32613257
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
32623258
; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3263-
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3264-
; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
3265-
; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm3
3266-
; AVX512F-NEXT: vpaddb (%rdx), %ymm3, %ymm1
3259+
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
3260+
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
32673261
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
32683262
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
32693263
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
@@ -3277,10 +3271,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
32773271
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
32783272
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
32793273
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3280-
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3281-
; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
3282-
; AVX512DQ-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm3
3283-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm3, %ymm1
3274+
; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
3275+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
32843276
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
32853277
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
32863278
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)

0 commit comments

Comments
 (0)