Skip to content

Commit be6c752

Browse files
authored
[X86] X86FixupVectorConstantsPass - use VPMOVSX/ZX extensions for PS/PD domain moves (#122601)
For targets with free domain moves, or AVX512 support, allow the use of VPMOVSX/ZX extension loads to reduce the load sizes. I've limited this to extensions to i32/i64 types, as we're mostly interested in shuffle mask loading here, but we could just as easily include i16 types as well. Inspired by a regression on #122485.
1 parent 4f7dc1b commit be6c752

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+2352
-1706
lines changed

llvm/lib/Target/X86/X86FixupVectorConstants.cpp

Lines changed: 90 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
338338
bool HasDQI = ST->hasDQI();
339339
bool HasBWI = ST->hasBWI();
340340
bool HasVLX = ST->hasVLX();
341+
bool MultiDomain = ST->hasAVX512() || ST->hasNoDomainDelayMov();
341342

342343
struct FixupEntry {
343344
int Op;
@@ -401,47 +402,107 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
401402
case X86::VMOVAPDrm:
402403
case X86::VMOVAPSrm:
403404
case X86::VMOVUPDrm:
404-
case X86::VMOVUPSrm:
405-
return FixupConstant({{X86::VMOVSSrm, 1, 32, rebuildZeroUpperCst},
406-
{X86::VBROADCASTSSrm, 1, 32, rebuildSplatCst},
407-
{X86::VMOVSDrm, 1, 64, rebuildZeroUpperCst},
408-
{X86::VMOVDDUPrm, 1, 64, rebuildSplatCst}},
409-
128, 1);
405+
case X86::VMOVUPSrm: {
406+
FixupEntry Fixups[] = {
407+
{MultiDomain ? X86::VPMOVSXBQrm : 0, 2, 8, rebuildSExtCst},
408+
{MultiDomain ? X86::VPMOVZXBQrm : 0, 2, 8, rebuildZExtCst},
409+
{X86::VMOVSSrm, 1, 32, rebuildZeroUpperCst},
410+
{X86::VBROADCASTSSrm, 1, 32, rebuildSplatCst},
411+
{MultiDomain ? X86::VPMOVSXBDrm : 0, 4, 8, rebuildSExtCst},
412+
{MultiDomain ? X86::VPMOVZXBDrm : 0, 4, 8, rebuildZExtCst},
413+
{MultiDomain ? X86::VPMOVSXWQrm : 0, 2, 16, rebuildSExtCst},
414+
{MultiDomain ? X86::VPMOVZXWQrm : 0, 2, 16, rebuildZExtCst},
415+
{X86::VMOVSDrm, 1, 64, rebuildZeroUpperCst},
416+
{X86::VMOVDDUPrm, 1, 64, rebuildSplatCst},
417+
{MultiDomain ? X86::VPMOVSXWDrm : 0, 4, 16, rebuildSExtCst},
418+
{MultiDomain ? X86::VPMOVZXWDrm : 0, 4, 16, rebuildZExtCst},
419+
{MultiDomain ? X86::VPMOVSXDQrm : 0, 2, 32, rebuildSExtCst},
420+
{MultiDomain ? X86::VPMOVZXDQrm : 0, 2, 32, rebuildZExtCst}};
421+
return FixupConstant(Fixups, 128, 1);
422+
}
410423
case X86::VMOVAPDYrm:
411424
case X86::VMOVAPSYrm:
412425
case X86::VMOVUPDYrm:
413-
case X86::VMOVUPSYrm:
414-
return FixupConstant({{X86::VBROADCASTSSYrm, 1, 32, rebuildSplatCst},
415-
{X86::VBROADCASTSDYrm, 1, 64, rebuildSplatCst},
416-
{X86::VBROADCASTF128rm, 1, 128, rebuildSplatCst}},
417-
256, 1);
426+
case X86::VMOVUPSYrm: {
427+
FixupEntry Fixups[] = {
428+
{X86::VBROADCASTSSYrm, 1, 32, rebuildSplatCst},
429+
{HasAVX2 && MultiDomain ? X86::VPMOVSXBQYrm : 0, 4, 8, rebuildSExtCst},
430+
{HasAVX2 && MultiDomain ? X86::VPMOVZXBQYrm : 0, 4, 8, rebuildZExtCst},
431+
{X86::VBROADCASTSDYrm, 1, 64, rebuildSplatCst},
432+
{HasAVX2 && MultiDomain ? X86::VPMOVSXBDYrm : 0, 8, 8, rebuildSExtCst},
433+
{HasAVX2 && MultiDomain ? X86::VPMOVZXBDYrm : 0, 8, 8, rebuildZExtCst},
434+
{HasAVX2 && MultiDomain ? X86::VPMOVSXWQYrm : 0, 4, 16, rebuildSExtCst},
435+
{HasAVX2 && MultiDomain ? X86::VPMOVZXWQYrm : 0, 4, 16, rebuildZExtCst},
436+
{X86::VBROADCASTF128rm, 1, 128, rebuildSplatCst},
437+
{HasAVX2 && MultiDomain ? X86::VPMOVSXWDYrm : 0, 8, 16, rebuildSExtCst},
438+
{HasAVX2 && MultiDomain ? X86::VPMOVZXWDYrm : 0, 8, 16, rebuildZExtCst},
439+
{HasAVX2 && MultiDomain ? X86::VPMOVSXDQYrm : 0, 4, 32, rebuildSExtCst},
440+
{HasAVX2 && MultiDomain ? X86::VPMOVZXDQYrm : 0, 4, 32,
441+
rebuildZExtCst}};
442+
return FixupConstant(Fixups, 256, 1);
443+
}
418444
case X86::VMOVAPDZ128rm:
419445
case X86::VMOVAPSZ128rm:
420446
case X86::VMOVUPDZ128rm:
421-
case X86::VMOVUPSZ128rm:
422-
return FixupConstant({{X86::VMOVSSZrm, 1, 32, rebuildZeroUpperCst},
423-
{X86::VBROADCASTSSZ128rm, 1, 32, rebuildSplatCst},
424-
{X86::VMOVSDZrm, 1, 64, rebuildZeroUpperCst},
425-
{X86::VMOVDDUPZ128rm, 1, 64, rebuildSplatCst}},
426-
128, 1);
447+
case X86::VMOVUPSZ128rm: {
448+
FixupEntry Fixups[] = {
449+
{MultiDomain ? X86::VPMOVSXBQZ128rm : 0, 2, 8, rebuildSExtCst},
450+
{MultiDomain ? X86::VPMOVZXBQZ128rm : 0, 2, 8, rebuildZExtCst},
451+
{X86::VMOVSSZrm, 1, 32, rebuildZeroUpperCst},
452+
{X86::VBROADCASTSSZ128rm, 1, 32, rebuildSplatCst},
453+
{MultiDomain ? X86::VPMOVSXBDZ128rm : 0, 4, 8, rebuildSExtCst},
454+
{MultiDomain ? X86::VPMOVZXBDZ128rm : 0, 4, 8, rebuildZExtCst},
455+
{MultiDomain ? X86::VPMOVSXWQZ128rm : 0, 2, 16, rebuildSExtCst},
456+
{MultiDomain ? X86::VPMOVZXWQZ128rm : 0, 2, 16, rebuildZExtCst},
457+
{X86::VMOVSDZrm, 1, 64, rebuildZeroUpperCst},
458+
{X86::VMOVDDUPZ128rm, 1, 64, rebuildSplatCst},
459+
{MultiDomain ? X86::VPMOVSXWDZ128rm : 0, 4, 16, rebuildSExtCst},
460+
{MultiDomain ? X86::VPMOVZXWDZ128rm : 0, 4, 16, rebuildZExtCst},
461+
{MultiDomain ? X86::VPMOVSXDQZ128rm : 0, 2, 32, rebuildSExtCst},
462+
{MultiDomain ? X86::VPMOVZXDQZ128rm : 0, 2, 32, rebuildZExtCst}};
463+
return FixupConstant(Fixups, 128, 1);
464+
}
427465
case X86::VMOVAPDZ256rm:
428466
case X86::VMOVAPSZ256rm:
429467
case X86::VMOVUPDZ256rm:
430-
case X86::VMOVUPSZ256rm:
431-
return FixupConstant(
432-
{{X86::VBROADCASTSSZ256rm, 1, 32, rebuildSplatCst},
433-
{X86::VBROADCASTSDZ256rm, 1, 64, rebuildSplatCst},
434-
{X86::VBROADCASTF32X4Z256rm, 1, 128, rebuildSplatCst}},
435-
256, 1);
468+
case X86::VMOVUPSZ256rm: {
469+
FixupEntry Fixups[] = {
470+
{X86::VBROADCASTSSZ256rm, 1, 32, rebuildSplatCst},
471+
{MultiDomain ? X86::VPMOVSXBQZ256rm : 0, 4, 8, rebuildSExtCst},
472+
{MultiDomain ? X86::VPMOVZXBQZ256rm : 0, 4, 8, rebuildZExtCst},
473+
{X86::VBROADCASTSDZ256rm, 1, 64, rebuildSplatCst},
474+
{MultiDomain ? X86::VPMOVSXBDZ256rm : 0, 8, 8, rebuildSExtCst},
475+
{MultiDomain ? X86::VPMOVZXBDZ256rm : 0, 8, 8, rebuildZExtCst},
476+
{MultiDomain ? X86::VPMOVSXWQZ256rm : 0, 4, 16, rebuildSExtCst},
477+
{MultiDomain ? X86::VPMOVZXWQZ256rm : 0, 4, 16, rebuildZExtCst},
478+
{X86::VBROADCASTF32X4Z256rm, 1, 128, rebuildSplatCst},
479+
{MultiDomain ? X86::VPMOVSXWDZ256rm : 0, 8, 16, rebuildSExtCst},
480+
{MultiDomain ? X86::VPMOVZXWDZ256rm : 0, 8, 16, rebuildZExtCst},
481+
{MultiDomain ? X86::VPMOVSXDQZ256rm : 0, 4, 32, rebuildSExtCst},
482+
{MultiDomain ? X86::VPMOVZXDQZ256rm : 0, 4, 32, rebuildZExtCst}};
483+
return FixupConstant(Fixups, 256, 1);
484+
}
436485
case X86::VMOVAPDZrm:
437486
case X86::VMOVAPSZrm:
438487
case X86::VMOVUPDZrm:
439-
case X86::VMOVUPSZrm:
440-
return FixupConstant({{X86::VBROADCASTSSZrm, 1, 32, rebuildSplatCst},
441-
{X86::VBROADCASTSDZrm, 1, 64, rebuildSplatCst},
442-
{X86::VBROADCASTF32X4Zrm, 1, 128, rebuildSplatCst},
443-
{X86::VBROADCASTF64X4Zrm, 1, 256, rebuildSplatCst}},
444-
512, 1);
488+
case X86::VMOVUPSZrm: {
489+
FixupEntry Fixups[] = {
490+
{X86::VBROADCASTSSZrm, 1, 32, rebuildSplatCst},
491+
{X86::VBROADCASTSDZrm, 1, 64, rebuildSplatCst},
492+
{MultiDomain ? X86::VPMOVSXBQZrm : 0, 8, 8, rebuildSExtCst},
493+
{MultiDomain ? X86::VPMOVZXBQZrm : 0, 8, 8, rebuildZExtCst},
494+
{X86::VBROADCASTF32X4Zrm, 1, 128, rebuildSplatCst},
495+
{MultiDomain ? X86::VPMOVSXBDZrm : 0, 16, 8, rebuildSExtCst},
496+
{MultiDomain ? X86::VPMOVZXBDZrm : 0, 16, 8, rebuildZExtCst},
497+
{MultiDomain ? X86::VPMOVSXWQZrm : 0, 8, 16, rebuildSExtCst},
498+
{MultiDomain ? X86::VPMOVZXWQZrm : 0, 8, 16, rebuildZExtCst},
499+
{X86::VBROADCASTF64X4Zrm, 1, 256, rebuildSplatCst},
500+
{MultiDomain ? X86::VPMOVSXWDZrm : 0, 16, 16, rebuildSExtCst},
501+
{MultiDomain ? X86::VPMOVZXWDZrm : 0, 16, 16, rebuildZExtCst},
502+
{MultiDomain ? X86::VPMOVSXDQZrm : 0, 8, 32, rebuildSExtCst},
503+
{MultiDomain ? X86::VPMOVZXDQZrm : 0, 8, 32, rebuildZExtCst}};
504+
return FixupConstant(Fixups, 512, 1);
505+
}
445506
/* Integer Loads */
446507
case X86::MOVDQArm:
447508
case X86::MOVDQUrm: {

llvm/test/CodeGen/X86/avx512-build-vector.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ define <16 x float> @test3(<4 x float> %a) {
1515
; CHECK-LABEL: test3:
1616
; CHECK: ## %bb.0:
1717
; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
18-
; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
18+
; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
1919
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
2020
; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
2121
; CHECK-NEXT: vmovaps %zmm1, %zmm0

llvm/test/CodeGen/X86/avx512-mask-op.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1060,12 +1060,12 @@ define i32 @test13_crash(i32 %x, i32 %y) {
10601060
define <4 x i1> @test14() {
10611061
; CHECK-LABEL: test14:
10621062
; CHECK: ## %bb.0:
1063-
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,0,1]
1063+
; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,1,0,1]
10641064
; CHECK-NEXT: retq
10651065
;
10661066
; X86-LABEL: test14:
10671067
; X86: ## %bb.0:
1068-
; X86-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,0,1]
1068+
; X86-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,1,0,1]
10691069
; X86-NEXT: retl
10701070
%a = bitcast i16 21845 to <16 x i1>
10711071
%b = extractelement <16 x i1> %a, i32 2

0 commit comments

Comments
 (0)