
Commit 86ed907

[X86] Disable lowering constant build vectors to broadcasts on AVX512 targets
On AVX512 targets we're better off keeping constant vectors at full width, to ensure they can be load-folded into vector instructions and reduce register pressure. If a vector constant remains as a basic load, X86FixupVectorConstantsPass will still convert it to a broadcast instruction for us. Non-VLX targets still see some regressions because these ops are implicitly widened to 512 bits in isel patterns rather than in the DAG, so I've limited this to just 512-bit vectors on those targets for now.
1 parent f958ad3 commit 86ed907
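
To illustrate the intent, here is a minimal sketch of an affected case (mine, not from this commit; the function name is hypothetical):

define <8 x i64> @add_const(<8 x i64> %x) {
  %r = add <8 x i64> %x, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
  ret <8 x i64> %r
}

Previously the splat operand could be lowered to a broadcast during build-vector lowering; with this patch it stays a full-width constant-pool value, so it can be load-folded into its consumer (roughly vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 via the broadcast fold tables), and if it survives as a plain full-width load, X86FixupVectorConstantsPass rewrites it back to a broadcast.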

96 files changed: +92596 −96160 lines

llvm/lib/Target/X86/X86FixupVectorConstants.cpp

Lines changed: 200 additions & 18 deletions

@@ -397,6 +397,26 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
                           {X86::VMOVSDZrm, 1, 64, rebuildZeroUpperCst},
                           {X86::VMOVDDUPZ128rm, 1, 64, rebuildSplatCst}},
                          1);
+  case X86::VMOVAPDZ128rmk:
+  case X86::VMOVUPDZ128rmk:
+    return FixupConstant({{X86::VMOVSDZrmk, 1, 64, rebuildZeroUpperCst},
+                          {X86::VMOVDDUPZ128rmk, 1, 64, rebuildSplatCst}},
+                         3);
+  case X86::VMOVAPDZ128rmkz:
+  case X86::VMOVUPDZ128rmkz:
+    return FixupConstant({{X86::VMOVSDZrmkz, 1, 64, rebuildZeroUpperCst},
+                          {X86::VMOVDDUPZ128rmkz, 1, 64, rebuildSplatCst}},
+                         2);
+  case X86::VMOVAPSZ128rmk:
+  case X86::VMOVUPSZ128rmk:
+    return FixupConstant({{X86::VMOVSSZrmk, 1, 32, rebuildZeroUpperCst},
+                          {X86::VBROADCASTSSZ128rmk, 1, 32, rebuildSplatCst}},
+                         3);
+  case X86::VMOVAPSZ128rmkz:
+  case X86::VMOVUPSZ128rmkz:
+    return FixupConstant({{X86::VMOVSSZrmkz, 1, 32, rebuildZeroUpperCst},
+                          {X86::VBROADCASTSSZ128rmkz, 1, 32, rebuildSplatCst}},
+                         2);
   case X86::VMOVAPDZ256rm:
   case X86::VMOVAPSZ256rm:
   case X86::VMOVUPDZ256rm:
@@ -406,6 +426,26 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
                           {X86::VBROADCASTSDZ256rm, 1, 64, rebuildSplatCst},
                           {X86::VBROADCASTF32X4Z256rm, 1, 128, rebuildSplatCst}},
                          1);
+  case X86::VMOVAPDZ256rmk:
+  case X86::VMOVUPDZ256rmk:
+    return FixupConstant({{X86::VBROADCASTSDZ256rmk, 1, 64, rebuildSplatCst}},
+                         3);
+  case X86::VMOVAPDZ256rmkz:
+  case X86::VMOVUPDZ256rmkz:
+    return FixupConstant({{X86::VBROADCASTSDZ256rmkz, 1, 64, rebuildSplatCst}},
+                         2);
+  case X86::VMOVAPSZ256rmk:
+  case X86::VMOVUPSZ256rmk:
+    return FixupConstant(
+        {{X86::VBROADCASTSSZ256rmk, 1, 32, rebuildSplatCst},
+         {X86::VBROADCASTF32X4Z256rmk, 1, 128, rebuildSplatCst}},
+        3);
+  case X86::VMOVAPSZ256rmkz:
+  case X86::VMOVUPSZ256rmkz:
+    return FixupConstant(
+        {{X86::VBROADCASTSSZ256rmkz, 1, 32, rebuildSplatCst},
+         {X86::VBROADCASTF32X4Z256rmkz, 1, 128, rebuildSplatCst}},
+        2);
   case X86::VMOVAPDZrm:
   case X86::VMOVAPSZrm:
   case X86::VMOVUPDZrm:
@@ -415,6 +455,26 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
                           {X86::VBROADCASTF32X4rm, 1, 128, rebuildSplatCst},
                           {X86::VBROADCASTF64X4rm, 1, 256, rebuildSplatCst}},
                          1);
+  case X86::VMOVAPDZrmk:
+  case X86::VMOVUPDZrmk:
+    return FixupConstant({{X86::VBROADCASTSDZrmk, 1, 64, rebuildSplatCst},
+                          {X86::VBROADCASTF64X4rmk, 1, 256, rebuildSplatCst}},
+                         3);
+  case X86::VMOVAPDZrmkz:
+  case X86::VMOVUPDZrmkz:
+    return FixupConstant({{X86::VBROADCASTSDZrmkz, 1, 64, rebuildSplatCst},
+                          {X86::VBROADCASTF64X4rmkz, 1, 256, rebuildSplatCst}},
+                         2);
+  case X86::VMOVAPSZrmk:
+  case X86::VMOVUPSZrmk:
+    return FixupConstant({{X86::VBROADCASTSSZrmk, 1, 32, rebuildSplatCst},
+                          {X86::VBROADCASTF32X4rmk, 1, 128, rebuildSplatCst}},
+                         3);
+  case X86::VMOVAPSZrmkz:
+  case X86::VMOVUPSZrmkz:
+    return FixupConstant({{X86::VBROADCASTSSZrmkz, 1, 32, rebuildSplatCst},
+                          {X86::VBROADCASTF32X4rmkz, 1, 128, rebuildSplatCst}},
+                         2);
   /* Integer Loads */
   case X86::MOVDQArm:
   case X86::MOVDQUrm: {
@@ -510,6 +570,42 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
                            {X86::VPMOVZXDQZ128rm, 2, 32, rebuildZExtCst}};
     return FixupConstant(Fixups, 1);
   }
+  case X86::VMOVDQA32Z128rmk:
+  case X86::VMOVDQU32Z128rmk:
+    return FixupConstant({{X86::VPBROADCASTDZ128rmk, 1, 32, rebuildSplatCst},
+                          {X86::VPMOVSXBDZ128rmk, 4, 8, rebuildSExtCst},
+                          {X86::VPMOVZXBDZ128rmk, 4, 8, rebuildZExtCst},
+                          {X86::VPMOVSXWDZ128rmk, 4, 16, rebuildSExtCst},
+                          {X86::VPMOVZXWDZ128rmk, 4, 16, rebuildZExtCst}},
+                         3);
+  case X86::VMOVDQA32Z128rmkz:
+  case X86::VMOVDQU32Z128rmkz:
+    return FixupConstant({{X86::VPBROADCASTDZ128rmkz, 1, 32, rebuildSplatCst},
+                          {X86::VPMOVSXBDZ128rmkz, 4, 8, rebuildSExtCst},
+                          {X86::VPMOVZXBDZ128rmkz, 4, 8, rebuildZExtCst},
+                          {X86::VPMOVSXWDZ128rmkz, 4, 16, rebuildSExtCst},
+                          {X86::VPMOVZXWDZ128rmkz, 4, 16, rebuildZExtCst}},
+                         2);
+  case X86::VMOVDQA64Z128rmk:
+  case X86::VMOVDQU64Z128rmk:
+    return FixupConstant({{X86::VPMOVSXBQZ128rmk, 2, 8, rebuildSExtCst},
+                          {X86::VPMOVZXBQZ128rmk, 2, 8, rebuildZExtCst},
+                          {X86::VPMOVSXWQZ128rmk, 2, 16, rebuildSExtCst},
+                          {X86::VPMOVZXWQZ128rmk, 2, 16, rebuildZExtCst},
+                          {X86::VPBROADCASTQZ128rmk, 1, 64, rebuildSplatCst},
+                          {X86::VPMOVSXDQZ128rmk, 2, 32, rebuildSExtCst},
+                          {X86::VPMOVZXDQZ128rmk, 2, 32, rebuildZExtCst}},
+                         3);
+  case X86::VMOVDQA64Z128rmkz:
+  case X86::VMOVDQU64Z128rmkz:
+    return FixupConstant({{X86::VPMOVSXBQZ128rmkz, 2, 8, rebuildSExtCst},
+                          {X86::VPMOVZXBQZ128rmkz, 2, 8, rebuildZExtCst},
+                          {X86::VPMOVSXWQZ128rmkz, 2, 16, rebuildSExtCst},
+                          {X86::VPMOVZXWQZ128rmkz, 2, 16, rebuildZExtCst},
+                          {X86::VPBROADCASTQZ128rmkz, 1, 64, rebuildSplatCst},
+                          {X86::VPMOVSXDQZ128rmkz, 2, 32, rebuildSExtCst},
+                          {X86::VPMOVZXDQZ128rmkz, 2, 32, rebuildZExtCst}},
+                         2);
   case X86::VMOVDQA32Z256rm:
   case X86::VMOVDQA64Z256rm:
   case X86::VMOVDQU32Z256rm:
@@ -534,6 +630,46 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
                            {X86::VPMOVZXDQZ256rm, 4, 32, rebuildZExtCst}};
     return FixupConstant(Fixups, 1);
   }
+  case X86::VMOVDQA32Z256rmk:
+  case X86::VMOVDQU32Z256rmk:
+    return FixupConstant(
+        {{X86::VPBROADCASTDZ256rmk, 1, 32, rebuildSplatCst},
+         {X86::VPMOVSXBDZ256rmk, 8, 8, rebuildSExtCst},
+         {X86::VPMOVZXBDZ256rmk, 8, 8, rebuildZExtCst},
+         {X86::VBROADCASTI32X4Z256rmk, 1, 128, rebuildSplatCst},
+         {X86::VPMOVSXWDZ256rmk, 8, 16, rebuildSExtCst},
+         {X86::VPMOVZXWDZ256rmk, 8, 16, rebuildZExtCst}},
+        3);
+  case X86::VMOVDQA32Z256rmkz:
+  case X86::VMOVDQU32Z256rmkz:
+    return FixupConstant(
+        {{X86::VPBROADCASTDZ256rmkz, 1, 32, rebuildSplatCst},
+         {X86::VPMOVSXBDZ256rmkz, 8, 8, rebuildSExtCst},
+         {X86::VPMOVZXBDZ256rmkz, 8, 8, rebuildZExtCst},
+         {X86::VBROADCASTI32X4Z256rmkz, 1, 128, rebuildSplatCst},
+         {X86::VPMOVSXWDZ256rmkz, 8, 16, rebuildSExtCst},
+         {X86::VPMOVZXWDZ256rmkz, 8, 16, rebuildZExtCst}},
+        2);
+  case X86::VMOVDQA64Z256rmk:
+  case X86::VMOVDQU64Z256rmk:
+    return FixupConstant({{X86::VPMOVSXBQZ256rmk, 4, 8, rebuildSExtCst},
+                          {X86::VPMOVZXBQZ256rmk, 4, 8, rebuildZExtCst},
+                          {X86::VPBROADCASTQZ256rmk, 1, 64, rebuildSplatCst},
+                          {X86::VPMOVSXWQZ256rmk, 4, 16, rebuildSExtCst},
+                          {X86::VPMOVZXWQZ256rmk, 4, 16, rebuildZExtCst},
+                          {X86::VPMOVSXDQZ256rmk, 4, 32, rebuildSExtCst},
+                          {X86::VPMOVZXDQZ256rmk, 4, 32, rebuildZExtCst}},
+                         3);
+  case X86::VMOVDQA64Z256rmkz:
+  case X86::VMOVDQU64Z256rmkz:
+    return FixupConstant({{X86::VPMOVSXBQZ256rmkz, 4, 8, rebuildSExtCst},
+                          {X86::VPMOVZXBQZ256rmkz, 4, 8, rebuildZExtCst},
+                          {X86::VPBROADCASTQZ256rmkz, 1, 64, rebuildSplatCst},
+                          {X86::VPMOVSXWQZ256rmkz, 4, 16, rebuildSExtCst},
+                          {X86::VPMOVZXWQZ256rmkz, 4, 16, rebuildZExtCst},
+                          {X86::VPMOVSXDQZ256rmkz, 4, 32, rebuildSExtCst},
+                          {X86::VPMOVZXDQZ256rmkz, 4, 32, rebuildZExtCst}},
+                         2);
   case X86::VMOVDQA32Zrm:
   case X86::VMOVDQA64Zrm:
   case X86::VMOVDQU32Zrm:
@@ -559,41 +695,87 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
                            {X86::VPMOVZXDQZrm, 8, 32, rebuildZExtCst}};
     return FixupConstant(Fixups, 1);
   }
+  case X86::VMOVDQA32Zrmk:
+  case X86::VMOVDQU32Zrmk:
+    return FixupConstant({{X86::VPBROADCASTDZrmk, 1, 32, rebuildSplatCst},
+                          {X86::VBROADCASTI32X4rmk, 1, 128, rebuildSplatCst},
+                          {X86::VPMOVSXBDZrmk, 16, 8, rebuildSExtCst},
+                          {X86::VPMOVZXBDZrmk, 16, 8, rebuildZExtCst},
+                          {X86::VPMOVSXWDZrmk, 16, 16, rebuildSExtCst},
+                          {X86::VPMOVZXWDZrmk, 16, 16, rebuildZExtCst}},
+                         3);
+  case X86::VMOVDQA32Zrmkz:
+  case X86::VMOVDQU32Zrmkz:
+    return FixupConstant({{X86::VPBROADCASTDZrmkz, 1, 32, rebuildSplatCst},
+                          {X86::VBROADCASTI32X4rmkz, 1, 128, rebuildSplatCst},
+                          {X86::VPMOVSXBDZrmkz, 16, 8, rebuildSExtCst},
+                          {X86::VPMOVZXBDZrmkz, 16, 8, rebuildZExtCst},
+                          {X86::VPMOVSXWDZrmkz, 16, 16, rebuildSExtCst},
+                          {X86::VPMOVZXWDZrmkz, 16, 16, rebuildZExtCst}},
+                         2);
+  case X86::VMOVDQA64Zrmk:
+  case X86::VMOVDQU64Zrmk:
+    return FixupConstant({{X86::VPBROADCASTQZrmk, 1, 64, rebuildSplatCst},
+                          {X86::VPMOVSXBQZrmk, 8, 8, rebuildSExtCst},
+                          {X86::VPMOVZXBQZrmk, 8, 8, rebuildZExtCst},
+                          {X86::VPMOVSXWQZrmk, 8, 16, rebuildSExtCst},
+                          {X86::VPMOVZXWQZrmk, 8, 16, rebuildZExtCst},
+                          {X86::VBROADCASTI64X4rmk, 1, 256, rebuildSplatCst},
+                          {X86::VPMOVSXDQZrmk, 8, 32, rebuildSExtCst},
+                          {X86::VPMOVZXDQZrmk, 8, 32, rebuildZExtCst}},
+                         3);
+  case X86::VMOVDQA64Zrmkz:
+  case X86::VMOVDQU64Zrmkz:
+    return FixupConstant({{X86::VPBROADCASTQZrmkz, 1, 64, rebuildSplatCst},
+                          {X86::VPMOVSXBQZrmkz, 8, 8, rebuildSExtCst},
+                          {X86::VPMOVZXBQZrmkz, 8, 8, rebuildZExtCst},
+                          {X86::VPMOVSXWQZrmkz, 8, 16, rebuildSExtCst},
+                          {X86::VPMOVZXWQZrmkz, 8, 16, rebuildZExtCst},
+                          {X86::VBROADCASTI64X4rmkz, 1, 256, rebuildSplatCst},
+                          {X86::VPMOVSXDQZrmkz, 8, 32, rebuildSExtCst},
+                          {X86::VPMOVZXDQZrmkz, 8, 32, rebuildZExtCst}},
+                         2);
   }

-  auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc32, unsigned OpSrc64) {
-    unsigned OpBcst32 = 0, OpBcst64 = 0;
-    unsigned OpNoBcst32 = 0, OpNoBcst64 = 0;
+  auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc16, unsigned OpSrc32,
+                                      unsigned OpSrc64) {
+    if (OpSrc16) {
+      if (const X86FoldTableEntry *Mem2Bcst =
+              llvm::lookupBroadcastFoldTableBySize(OpSrc16, 16)) {
+        unsigned OpBcst16 = Mem2Bcst->DstOp;
+        unsigned OpNoBcst16 = Mem2Bcst->Flags & TB_INDEX_MASK;
+        FixupEntry Fixups[] = {{(int)OpBcst16, 1, 16, rebuildSplatCst}};
+        if (FixupConstant(Fixups, OpNoBcst16))
+          return true;
+      }
+    }
     if (OpSrc32) {
       if (const X86FoldTableEntry *Mem2Bcst =
               llvm::lookupBroadcastFoldTableBySize(OpSrc32, 32)) {
-        OpBcst32 = Mem2Bcst->DstOp;
-        OpNoBcst32 = Mem2Bcst->Flags & TB_INDEX_MASK;
+        unsigned OpBcst32 = Mem2Bcst->DstOp;
+        unsigned OpNoBcst32 = Mem2Bcst->Flags & TB_INDEX_MASK;
+        FixupEntry Fixups[] = {{(int)OpBcst32, 1, 32, rebuildSplatCst}};
+        if (FixupConstant(Fixups, OpNoBcst32))
+          return true;
       }
     }
     if (OpSrc64) {
       if (const X86FoldTableEntry *Mem2Bcst =
               llvm::lookupBroadcastFoldTableBySize(OpSrc64, 64)) {
-        OpBcst64 = Mem2Bcst->DstOp;
-        OpNoBcst64 = Mem2Bcst->Flags & TB_INDEX_MASK;
+        unsigned OpBcst64 = Mem2Bcst->DstOp;
+        unsigned OpNoBcst64 = Mem2Bcst->Flags & TB_INDEX_MASK;
+        FixupEntry Fixups[] = {{(int)OpBcst64, 1, 64, rebuildSplatCst}};
+        if (FixupConstant(Fixups, OpNoBcst64))
+          return true;
       }
     }
-    assert(((OpBcst32 == 0) || (OpBcst64 == 0) || (OpNoBcst32 == OpNoBcst64)) &&
-           "OperandNo mismatch");
-
-    if (OpBcst32 || OpBcst64) {
-      unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32;
-      FixupEntry Fixups[] = {{(int)OpBcst32, 32, 32, rebuildSplatCst},
-                             {(int)OpBcst64, 64, 64, rebuildSplatCst}};
-      return FixupConstant(Fixups, OpNo);
-    }
     return false;
   };

   // Attempt to find a AVX512 mapping from a full width memory-fold instruction
   // to a broadcast-fold instruction variant.
   if ((MI.getDesc().TSFlags & X86II::EncodingMask) == X86II::EVEX)
-    return ConvertToBroadcastAVX512(Opc, Opc);
+    return ConvertToBroadcastAVX512(Opc, Opc, Opc);

   // Reverse the X86InstrInfo::setExecutionDomainCustom EVEX->VEX logic
   // conversion to see if we can convert to a broadcasted (integer) logic op.
@@ -650,7 +832,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
       break;
     }
     if (OpSrc32 || OpSrc64)
-      return ConvertToBroadcastAVX512(OpSrc32, OpSrc64);
+      return ConvertToBroadcastAVX512(0, OpSrc32, OpSrc64);
   }

   return false;
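
The new rmk/rmkz cases above extend the existing fixups to masked loads; as I read the tables, the trailing FixupConstant argument (1, 2 or 3) is simply the index of the first memory operand, which shifts as passthru and mask operands precede it (rm: dst, mem; rmkz: dst, mask, mem; rmk: dst, passthru, mask, mem). A minimal sketch of IR that should exercise a merge-masked form (hypothetical, not one of this commit's tests):

define <16 x i32> @masked_const(<16 x i32> %passthru, i16 %m) {
  %mask = bitcast i16 %m to <16 x i1>
  %r = select <16 x i1> %mask, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <16 x i32> %passthru
  ret <16 x i32> %r
}

If this selects to a merge-masked constant load (a VMOVDQA32Zrmk), the pass can now rebuild it as a masked broadcast along the lines of vpbroadcastd {{.*}}(%rip), %zmm0 {%k1}.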

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 8 additions & 0 deletions

@@ -7150,6 +7150,14 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
          "Unsupported vector type for broadcast.");

+  // On AVX512VL targets we're better off keeping the full width constant load
+  // and letting X86FixupVectorConstantsPass handle conversion to
+  // broadcast/broadcast-fold.
+  // AVX512 targets without AVX512VL can do this only for 512-bit vectors.
+  if (Subtarget.hasAVX512() && (Subtarget.hasVLX() || VT.is512BitVector()) &&
+      BVOp->isConstant())
+    return SDValue();
+
   // See if the build vector is a repeating sequence of scalars (inc. splat).
   SDValue Ld;
   BitVector UndefElements;
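
As a sketch of the width gate in the new guard (my reading; the example is not from the commit): without AVX512VL, a 256-bit constant build vector is not exempted, since VT.is512BitVector() is false, so it still takes the broadcast-lowering path:

define <8 x i32> @splat256(<8 x i32> %x) {
  %r = add <8 x i32> %x, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ret <8 x i32> %r
}

Compiled with just +avx512f this should still get an up-front broadcast of the constant, avoiding the implicit 512-bit widening regressions the commit message mentions; with +avx512vl the early return fires for 128-, 256- and 512-bit constants alike.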

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 8 additions & 16 deletions

@@ -1480,10 +1480,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
 ; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpternlogq $202, %ymm0, %ymm1, %ymm2
-; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
+; AVX512F-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
+; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -1495,10 +1493,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i
 ; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpternlogq $202, %ymm0, %ymm1, %ymm2
-; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
+; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -3260,10 +3256,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
 ; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm1
 ; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
-; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512F-NEXT:    vpternlogq $202, %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT:    vpaddb (%rdx), %ymm3, %ymm1
+; AVX512F-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
+; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
 ; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
 ; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
@@ -3277,10 +3271,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
 ; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm1
 ; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
-; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512DQ-NEXT:    vpternlogq $202, %ymm2, %ymm1, %ymm3
-; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm3, %ymm1
+; AVX512DQ-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
+; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
 ; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
 ; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rcx)