Skip to content

Commit 6eed82f

Browse files
committed
[X86] Disable lowering constant build vectors to broadcasts on AVX512 targets
On AVX512 targets we're better off keeping constant vectors at full width to ensure that they can be load-folded into vector instructions, reducing register pressure. If a vector constant remains as a basic load, X86FixupVectorConstantsPass will still convert it to a broadcast instruction for us. Non-VLX targets are still seeing some regressions because these ops are implicitly widened to 512-bit in isel patterns rather than in the DAG, so I've limited this to just 512-bit vectors for now.
1 parent 9d7df23 commit 6eed82f

File tree

102 files changed

+92465
-96088
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

102 files changed

+92465
-96088
lines changed

llvm/lib/Target/X86/X86FixupVectorConstants.cpp

Lines changed: 187 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,26 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
424424
{X86::VMOVSDZrm, 1, 64, rebuildZeroUpperCst},
425425
{X86::VMOVDDUPZ128rm, 1, 64, rebuildSplatCst}},
426426
128, 1);
427+
case X86::VMOVAPDZ128rmk:
428+
case X86::VMOVUPDZ128rmk:
429+
return FixupConstant({{X86::VMOVSDZrmk, 1, 64, rebuildZeroUpperCst},
430+
{X86::VMOVDDUPZ128rmk, 1, 64, rebuildSplatCst}},
431+
128, 3);
432+
case X86::VMOVAPDZ128rmkz:
433+
case X86::VMOVUPDZ128rmkz:
434+
return FixupConstant({{X86::VMOVSDZrmkz, 1, 64, rebuildZeroUpperCst},
435+
{X86::VMOVDDUPZ128rmkz, 1, 64, rebuildSplatCst}},
436+
128, 2);
437+
case X86::VMOVAPSZ128rmk:
438+
case X86::VMOVUPSZ128rmk:
439+
return FixupConstant({{X86::VMOVSSZrmk, 1, 32, rebuildZeroUpperCst},
440+
{X86::VBROADCASTSSZ128rmk, 1, 32, rebuildSplatCst}},
441+
128, 3);
442+
case X86::VMOVAPSZ128rmkz:
443+
case X86::VMOVUPSZ128rmkz:
444+
return FixupConstant({{X86::VMOVSSZrmkz, 1, 32, rebuildZeroUpperCst},
445+
{X86::VBROADCASTSSZ128rmkz, 1, 32, rebuildSplatCst}},
446+
128, 2);
427447
case X86::VMOVAPDZ256rm:
428448
case X86::VMOVAPSZ256rm:
429449
case X86::VMOVUPDZ256rm:
@@ -433,6 +453,26 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
433453
{X86::VBROADCASTSDZ256rm, 1, 64, rebuildSplatCst},
434454
{X86::VBROADCASTF32X4Z256rm, 1, 128, rebuildSplatCst}},
435455
256, 1);
456+
case X86::VMOVAPDZ256rmk:
457+
case X86::VMOVUPDZ256rmk:
458+
return FixupConstant({{X86::VBROADCASTSDZ256rmk, 1, 64, rebuildSplatCst}},
459+
256, 3);
460+
case X86::VMOVAPDZ256rmkz:
461+
case X86::VMOVUPDZ256rmkz:
462+
return FixupConstant({{X86::VBROADCASTSDZ256rmkz, 1, 64, rebuildSplatCst}},
463+
256, 2);
464+
case X86::VMOVAPSZ256rmk:
465+
case X86::VMOVUPSZ256rmk:
466+
return FixupConstant(
467+
{{X86::VBROADCASTSSZ256rmk, 1, 32, rebuildSplatCst},
468+
{X86::VBROADCASTF32X4Z256rmk, 1, 128, rebuildSplatCst}},
469+
256, 3);
470+
case X86::VMOVAPSZ256rmkz:
471+
case X86::VMOVUPSZ256rmkz:
472+
return FixupConstant(
473+
{{X86::VBROADCASTSSZ256rmkz, 1, 32, rebuildSplatCst},
474+
{X86::VBROADCASTF32X4Z256rmkz, 1, 128, rebuildSplatCst}},
475+
256, 2);
436476
case X86::VMOVAPDZrm:
437477
case X86::VMOVAPSZrm:
438478
case X86::VMOVUPDZrm:
@@ -442,6 +482,26 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
442482
{X86::VBROADCASTF32X4Zrm, 1, 128, rebuildSplatCst},
443483
{X86::VBROADCASTF64X4Zrm, 1, 256, rebuildSplatCst}},
444484
512, 1);
485+
case X86::VMOVAPDZrmk:
486+
case X86::VMOVUPDZrmk:
487+
return FixupConstant({{X86::VBROADCASTSDZrmk, 1, 64, rebuildSplatCst},
488+
{X86::VBROADCASTF64X4Zrmk, 1, 256, rebuildSplatCst}},
489+
512, 3);
490+
case X86::VMOVAPDZrmkz:
491+
case X86::VMOVUPDZrmkz:
492+
return FixupConstant({{X86::VBROADCASTSDZrmkz, 1, 64, rebuildSplatCst},
493+
{X86::VBROADCASTF64X4Zrmkz, 1, 256, rebuildSplatCst}},
494+
512, 2);
495+
case X86::VMOVAPSZrmk:
496+
case X86::VMOVUPSZrmk:
497+
return FixupConstant({{X86::VBROADCASTSSZrmk, 1, 32, rebuildSplatCst},
498+
{X86::VBROADCASTF32X4Zrmk, 1, 128, rebuildSplatCst}},
499+
512, 3);
500+
case X86::VMOVAPSZrmkz:
501+
case X86::VMOVUPSZrmkz:
502+
return FixupConstant({{X86::VBROADCASTSSZrmkz, 1, 32, rebuildSplatCst},
503+
{X86::VBROADCASTF32X4Zrmkz, 1, 128, rebuildSplatCst}},
504+
512, 2);
445505
/* Integer Loads */
446506
case X86::MOVDQArm:
447507
case X86::MOVDQUrm: {
@@ -537,6 +597,42 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
537597
{X86::VPMOVZXDQZ128rm, 2, 32, rebuildZExtCst}};
538598
return FixupConstant(Fixups, 128, 1);
539599
}
600+
case X86::VMOVDQA32Z128rmk:
601+
case X86::VMOVDQU32Z128rmk:
602+
return FixupConstant({{X86::VPBROADCASTDZ128rmk, 1, 32, rebuildSplatCst},
603+
{X86::VPMOVSXBDZ128rmk, 4, 8, rebuildSExtCst},
604+
{X86::VPMOVZXBDZ128rmk, 4, 8, rebuildZExtCst},
605+
{X86::VPMOVSXWDZ128rmk, 4, 16, rebuildSExtCst},
606+
{X86::VPMOVZXWDZ128rmk, 4, 16, rebuildZExtCst}},
607+
128, 3);
608+
case X86::VMOVDQA32Z128rmkz:
609+
case X86::VMOVDQU32Z128rmkz:
610+
return FixupConstant({{X86::VPBROADCASTDZ128rmkz, 1, 32, rebuildSplatCst},
611+
{X86::VPMOVSXBDZ128rmkz, 4, 8, rebuildSExtCst},
612+
{X86::VPMOVZXBDZ128rmkz, 4, 8, rebuildZExtCst},
613+
{X86::VPMOVSXWDZ128rmkz, 4, 16, rebuildSExtCst},
614+
{X86::VPMOVZXWDZ128rmkz, 4, 16, rebuildZExtCst}},
615+
128, 2);
616+
case X86::VMOVDQA64Z128rmk:
617+
case X86::VMOVDQU64Z128rmk:
618+
return FixupConstant({{X86::VPMOVSXBQZ128rmk, 2, 8, rebuildSExtCst},
619+
{X86::VPMOVZXBQZ128rmk, 2, 8, rebuildZExtCst},
620+
{X86::VPMOVSXWQZ128rmk, 2, 16, rebuildSExtCst},
621+
{X86::VPMOVZXWQZ128rmk, 2, 16, rebuildZExtCst},
622+
{X86::VPBROADCASTQZ128rmk, 1, 64, rebuildSplatCst},
623+
{X86::VPMOVSXDQZ128rmk, 2, 32, rebuildSExtCst},
624+
{X86::VPMOVZXDQZ128rmk, 2, 32, rebuildZExtCst}},
625+
128, 3);
626+
case X86::VMOVDQA64Z128rmkz:
627+
case X86::VMOVDQU64Z128rmkz:
628+
return FixupConstant({{X86::VPMOVSXBQZ128rmkz, 2, 8, rebuildSExtCst},
629+
{X86::VPMOVZXBQZ128rmkz, 2, 8, rebuildZExtCst},
630+
{X86::VPMOVSXWQZ128rmkz, 2, 16, rebuildSExtCst},
631+
{X86::VPMOVZXWQZ128rmkz, 2, 16, rebuildZExtCst},
632+
{X86::VPBROADCASTQZ128rmkz, 1, 64, rebuildSplatCst},
633+
{X86::VPMOVSXDQZ128rmkz, 2, 32, rebuildSExtCst},
634+
{X86::VPMOVZXDQZ128rmkz, 2, 32, rebuildZExtCst}},
635+
128, 2);
540636
case X86::VMOVDQA32Z256rm:
541637
case X86::VMOVDQA64Z256rm:
542638
case X86::VMOVDQU32Z256rm:
@@ -561,6 +657,46 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
561657
{X86::VPMOVZXDQZ256rm, 4, 32, rebuildZExtCst}};
562658
return FixupConstant(Fixups, 256, 1);
563659
}
660+
case X86::VMOVDQA32Z256rmk:
661+
case X86::VMOVDQU32Z256rmk:
662+
return FixupConstant(
663+
{{X86::VPBROADCASTDZ256rmk, 1, 32, rebuildSplatCst},
664+
{X86::VPMOVSXBDZ256rmk, 8, 8, rebuildSExtCst},
665+
{X86::VPMOVZXBDZ256rmk, 8, 8, rebuildZExtCst},
666+
{X86::VBROADCASTI32X4Z256rmk, 1, 128, rebuildSplatCst},
667+
{X86::VPMOVSXWDZ256rmk, 8, 16, rebuildSExtCst},
668+
{X86::VPMOVZXWDZ256rmk, 8, 16, rebuildZExtCst}},
669+
256, 3);
670+
case X86::VMOVDQA32Z256rmkz:
671+
case X86::VMOVDQU32Z256rmkz:
672+
return FixupConstant(
673+
{{X86::VPBROADCASTDZ256rmkz, 1, 32, rebuildSplatCst},
674+
{X86::VPMOVSXBDZ256rmkz, 8, 8, rebuildSExtCst},
675+
{X86::VPMOVZXBDZ256rmkz, 8, 8, rebuildZExtCst},
676+
{X86::VBROADCASTI32X4Z256rmkz, 1, 128, rebuildSplatCst},
677+
{X86::VPMOVSXWDZ256rmkz, 8, 16, rebuildSExtCst},
678+
{X86::VPMOVZXWDZ256rmkz, 8, 16, rebuildZExtCst}},
679+
256, 2);
680+
case X86::VMOVDQA64Z256rmk:
681+
case X86::VMOVDQU64Z256rmk:
682+
return FixupConstant({{X86::VPMOVSXBQZ256rmk, 4, 8, rebuildSExtCst},
683+
{X86::VPMOVZXBQZ256rmk, 4, 8, rebuildZExtCst},
684+
{X86::VPBROADCASTQZ256rmk, 1, 64, rebuildSplatCst},
685+
{X86::VPMOVSXWQZ256rmk, 4, 16, rebuildSExtCst},
686+
{X86::VPMOVZXWQZ256rmk, 4, 16, rebuildZExtCst},
687+
{X86::VPMOVSXDQZ256rmk, 4, 32, rebuildSExtCst},
688+
{X86::VPMOVZXDQZ256rmk, 4, 32, rebuildZExtCst}},
689+
256, 3);
690+
case X86::VMOVDQA64Z256rmkz:
691+
case X86::VMOVDQU64Z256rmkz:
692+
return FixupConstant({{X86::VPMOVSXBQZ256rmkz, 4, 8, rebuildSExtCst},
693+
{X86::VPMOVZXBQZ256rmkz, 4, 8, rebuildZExtCst},
694+
{X86::VPBROADCASTQZ256rmkz, 1, 64, rebuildSplatCst},
695+
{X86::VPMOVSXWQZ256rmkz, 4, 16, rebuildSExtCst},
696+
{X86::VPMOVZXWQZ256rmkz, 4, 16, rebuildZExtCst},
697+
{X86::VPMOVSXDQZ256rmkz, 4, 32, rebuildSExtCst},
698+
{X86::VPMOVZXDQZ256rmkz, 4, 32, rebuildZExtCst}},
699+
256, 2);
564700
case X86::VMOVDQA32Zrm:
565701
case X86::VMOVDQA64Zrm:
566702
case X86::VMOVDQU32Zrm:
@@ -586,43 +722,67 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
586722
{X86::VPMOVZXDQZrm, 8, 32, rebuildZExtCst}};
587723
return FixupConstant(Fixups, 512, 1);
588724
}
725+
case X86::VMOVDQA32Zrmk:
726+
case X86::VMOVDQU32Zrmk:
727+
return FixupConstant({{X86::VPBROADCASTDZrmk, 1, 32, rebuildSplatCst},
728+
{X86::VBROADCASTI32X4Zrmk, 1, 128, rebuildSplatCst},
729+
{X86::VPMOVSXBDZrmk, 16, 8, rebuildSExtCst},
730+
{X86::VPMOVZXBDZrmk, 16, 8, rebuildZExtCst},
731+
{X86::VPMOVSXWDZrmk, 16, 16, rebuildSExtCst},
732+
{X86::VPMOVZXWDZrmk, 16, 16, rebuildZExtCst}},
733+
512, 3);
734+
case X86::VMOVDQA32Zrmkz:
735+
case X86::VMOVDQU32Zrmkz:
736+
return FixupConstant({{X86::VPBROADCASTDZrmkz, 1, 32, rebuildSplatCst},
737+
{X86::VBROADCASTI32X4Zrmkz, 1, 128, rebuildSplatCst},
738+
{X86::VPMOVSXBDZrmkz, 16, 8, rebuildSExtCst},
739+
{X86::VPMOVZXBDZrmkz, 16, 8, rebuildZExtCst},
740+
{X86::VPMOVSXWDZrmkz, 16, 16, rebuildSExtCst},
741+
{X86::VPMOVZXWDZrmkz, 16, 16, rebuildZExtCst}},
742+
512, 2);
743+
case X86::VMOVDQA64Zrmk:
744+
case X86::VMOVDQU64Zrmk:
745+
return FixupConstant({{X86::VPBROADCASTQZrmk, 1, 64, rebuildSplatCst},
746+
{X86::VPMOVSXBQZrmk, 8, 8, rebuildSExtCst},
747+
{X86::VPMOVZXBQZrmk, 8, 8, rebuildZExtCst},
748+
{X86::VPMOVSXWQZrmk, 8, 16, rebuildSExtCst},
749+
{X86::VPMOVZXWQZrmk, 8, 16, rebuildZExtCst},
750+
{X86::VBROADCASTI64X4Zrmk, 1, 256, rebuildSplatCst},
751+
{X86::VPMOVSXDQZrmk, 8, 32, rebuildSExtCst},
752+
{X86::VPMOVZXDQZrmk, 8, 32, rebuildZExtCst}},
753+
512, 3);
754+
case X86::VMOVDQA64Zrmkz:
755+
case X86::VMOVDQU64Zrmkz:
756+
return FixupConstant({{X86::VPBROADCASTQZrmkz, 1, 64, rebuildSplatCst},
757+
{X86::VPMOVSXBQZrmkz, 8, 8, rebuildSExtCst},
758+
{X86::VPMOVZXBQZrmkz, 8, 8, rebuildZExtCst},
759+
{X86::VPMOVSXWQZrmkz, 8, 16, rebuildSExtCst},
760+
{X86::VPMOVZXWQZrmkz, 8, 16, rebuildZExtCst},
761+
{X86::VBROADCASTI64X4Zrmkz, 1, 256, rebuildSplatCst},
762+
{X86::VPMOVSXDQZrmkz, 8, 32, rebuildSExtCst},
763+
{X86::VPMOVZXDQZrmkz, 8, 32, rebuildZExtCst}},
764+
512, 2);
589765
}
590766

591-
auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc32, unsigned OpSrc64) {
592-
unsigned OpBcst32 = 0, OpBcst64 = 0;
593-
unsigned OpNoBcst32 = 0, OpNoBcst64 = 0;
594-
if (OpSrc32) {
595-
if (const X86FoldTableEntry *Mem2Bcst =
596-
llvm::lookupBroadcastFoldTableBySize(OpSrc32, 32)) {
597-
OpBcst32 = Mem2Bcst->DstOp;
598-
OpNoBcst32 = Mem2Bcst->Flags & TB_INDEX_MASK;
599-
}
600-
}
601-
if (OpSrc64) {
602-
if (const X86FoldTableEntry *Mem2Bcst =
603-
llvm::lookupBroadcastFoldTableBySize(OpSrc64, 64)) {
604-
OpBcst64 = Mem2Bcst->DstOp;
605-
OpNoBcst64 = Mem2Bcst->Flags & TB_INDEX_MASK;
606-
}
607-
}
608-
assert(((OpBcst32 == 0) || (OpBcst64 == 0) || (OpNoBcst32 == OpNoBcst64)) &&
609-
"OperandNo mismatch");
610-
611-
if (OpBcst32 || OpBcst64) {
612-
unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32;
613-
FixupEntry Fixups[] = {{(int)OpBcst32, 32, 32, rebuildSplatCst},
614-
{(int)OpBcst64, 64, 64, rebuildSplatCst}};
767+
auto ConvertToBroadcast = [&](unsigned OpSrc, int BW) {
768+
if (const X86FoldTableEntry *Mem2Bcst =
769+
llvm::lookupBroadcastFoldTableBySize(OpSrc, BW)) {
770+
unsigned OpBcst = Mem2Bcst->DstOp;
771+
unsigned OpNoBcst = Mem2Bcst->Flags & TB_INDEX_MASK;
772+
FixupEntry Fixups[] = {{(int)OpBcst, 1, BW, rebuildSplatCst}};
615773
// TODO: Add support for RegBitWidth, but currently rebuildSplatCst
616774
// doesn't require it (defaults to Constant::getPrimitiveSizeInBits).
617-
return FixupConstant(Fixups, 0, OpNo);
775+
if (FixupConstant(Fixups, 0, OpNoBcst))
776+
return true;
618777
}
619778
return false;
620779
};
621780

622781
// Attempt to find a AVX512 mapping from a full width memory-fold instruction
623782
// to a broadcast-fold instruction variant.
624783
if ((MI.getDesc().TSFlags & X86II::EncodingMask) == X86II::EVEX)
625-
return ConvertToBroadcastAVX512(Opc, Opc);
784+
return ConvertToBroadcast(Opc, 16) || ConvertToBroadcast(Opc, 32) ||
785+
ConvertToBroadcast(Opc, 64);
626786

627787
// Reverse the X86InstrInfo::setExecutionDomainCustom EVEX->VEX logic
628788
// conversion to see if we can convert to a broadcasted (integer) logic op.
@@ -679,7 +839,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
679839
break;
680840
}
681841
if (OpSrc32 || OpSrc64)
682-
return ConvertToBroadcastAVX512(OpSrc32, OpSrc64);
842+
return ConvertToBroadcast(OpSrc32, 32) || ConvertToBroadcast(OpSrc64, 64);
683843
}
684844

685845
return false;

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7502,6 +7502,14 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
75027502
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
75037503
"Unsupported vector type for broadcast.");
75047504

7505+
// On AVX512VL targets we're better off keeping the full width constant load
7506+
// and letting X86FixupVectorConstantsPass handle conversion to
7507+
// broadcast/broadcast-fold.
7508+
// AVX512 targets without AVX512VL can do this only for 512-bit vectors.
7509+
if (Subtarget.hasAVX512() && (Subtarget.hasVLX() || VT.is512BitVector()) &&
7510+
BVOp->isConstant())
7511+
return SDValue();
7512+
75057513
// See if the build vector is a repeating sequence of scalars (inc. splat).
75067514
SDValue Ld;
75077515
BitVector UndefElements;

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1480,10 +1480,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
14801480
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
14811481
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
14821482
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1483-
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1484-
; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
1485-
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0))
1486-
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
1483+
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
1484+
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
14871485
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
14881486
; AVX512F-NEXT: vzeroupper
14891487
; AVX512F-NEXT: retq
@@ -1495,10 +1493,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
14951493
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
14961494
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
14971495
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1498-
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1499-
; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
1500-
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0))
1501-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
1496+
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
1497+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
15021498
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
15031499
; AVX512DQ-NEXT: vzeroupper
15041500
; AVX512DQ-NEXT: retq
@@ -3253,10 +3249,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
32533249
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
32543250
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
32553251
; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3256-
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3257-
; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
3258-
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm1 ^ ymm2))
3259-
; AVX512F-NEXT: vpaddb (%rdx), %ymm3, %ymm1
3252+
; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
3253+
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
32603254
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
32613255
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
32623256
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
@@ -3270,10 +3264,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
32703264
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
32713265
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
32723266
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
3273-
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3274-
; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
3275-
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm1 ^ ymm2))
3276-
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm3, %ymm1
3267+
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
3268+
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1
32773269
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
32783270
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
32793271
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)

0 commit comments

Comments
 (0)