Skip to content

Commit c4f2b09

Browse files
committed
[X86] Add masked versions of VPERMT2* and VPERMI2* to load folding tables.
llvm-svn: 289186
1 parent f74fcdd commit c4f2b09

File tree

2 files changed

+112
-6
lines changed

2 files changed

+112
-6
lines changed

llvm/lib/Target/X86/X86InstrInfo.cpp

Lines changed: 84 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2675,11 +2675,23 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
26752675
{ X86::VPANDQZrrk, X86::VPANDQZrmk, 0 },
26762676
{ X86::VPERMBZrrk, X86::VPERMBZrmk, 0 },
26772677
{ X86::VPERMDZrrk, X86::VPERMDZrmk, 0 },
2678+
{ X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 },
2679+
{ X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 },
2680+
{ X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 },
2681+
{ X86::VPERMI2PDrrk, X86::VPERMI2PDrmk, 0 },
2682+
{ X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 },
2683+
{ X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 },
26782684
{ X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 },
26792685
{ X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 },
26802686
{ X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 },
26812687
{ X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 },
26822688
{ X86::VPERMQZrrk, X86::VPERMQZrmk, 0 },
2689+
{ X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 },
2690+
{ X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 },
2691+
{ X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 },
2692+
{ X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 },
2693+
{ X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 },
2694+
{ X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 },
26832695
{ X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
26842696
{ X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
26852697
{ X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
@@ -2694,9 +2706,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
26942706
{ X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 },
26952707
{ X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 },
26962708
{ X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 },
2697-
{ X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 },
26982709
{ X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 },
2699-
{ X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 },
27002710
{ X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 },
27012711
{ X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 },
27022712
{ X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 },
@@ -2755,11 +2765,23 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
27552765
{ X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
27562766
{ X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 },
27572767
{ X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 },
2768+
{ X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 },
2769+
{ X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 },
2770+
{ X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 },
2771+
{ X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 },
2772+
{ X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 },
2773+
{ X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 },
27582774
{ X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 },
27592775
{ X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 },
27602776
{ X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 },
27612777
{ X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 },
27622778
{ X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 },
2779+
{ X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 },
2780+
{ X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 },
2781+
{ X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 },
2782+
{ X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 },
2783+
{ X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 },
2784+
{ X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 },
27632785
{ X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
27642786
{ X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
27652787
{ X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
@@ -2775,9 +2797,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
27752797
{ X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 },
27762798
{ X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 },
27772799
{ X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 },
2778-
{ X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 },
27792800
{ X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 },
2780-
{ X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 },
27812801
{ X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 },
27822802
{ X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 },
27832803
{ X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 },
@@ -2831,8 +2851,20 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
28312851
{ X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
28322852
{ X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
28332853
{ X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 },
2854+
{ X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 },
2855+
{ X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 },
2856+
{ X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 },
2857+
{ X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 },
2858+
{ X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 },
2859+
{ X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 },
28342860
{ X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 },
28352861
{ X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 },
2862+
{ X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 },
2863+
{ X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 },
2864+
{ X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 },
2865+
{ X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 },
2866+
{ X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 },
2867+
{ X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 },
28362868
{ X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
28372869
{ X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
28382870
{ X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
@@ -2848,9 +2880,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
28482880
{ X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 },
28492881
{ X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 },
28502882
{ X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 },
2851-
{ X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 },
28522883
{ X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 },
2853-
{ X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 },
28542884
{ X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 },
28552885
{ X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 },
28562886
{ X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 },
@@ -2869,6 +2899,54 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
28692899
{ X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 },
28702900
{ X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 },
28712901
{ X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 },
2902+
2903+
// 512-bit three source instructions with zero masking.
2904+
{ X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 },
2905+
{ X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 },
2906+
{ X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 },
2907+
{ X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 },
2908+
{ X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 },
2909+
{ X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 },
2910+
{ X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 },
2911+
{ X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 },
2912+
{ X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 },
2913+
{ X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 },
2914+
{ X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 },
2915+
{ X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 },
2916+
{ X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 },
2917+
{ X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 },
2918+
2919+
// 256-bit three source instructions with zero masking.
2920+
{ X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 },
2921+
{ X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 },
2922+
{ X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 },
2923+
{ X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 },
2924+
{ X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 },
2925+
{ X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 },
2926+
{ X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 },
2927+
{ X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 },
2928+
{ X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 },
2929+
{ X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 },
2930+
{ X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 },
2931+
{ X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 },
2932+
{ X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 },
2933+
{ X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 },
2934+
2935+
// 128-bit three source instructions with zero masking.
2936+
{ X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 },
2937+
{ X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 },
2938+
{ X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 },
2939+
{ X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 },
2940+
{ X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 },
2941+
{ X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 },
2942+
{ X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 },
2943+
{ X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 },
2944+
{ X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 },
2945+
{ X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 },
2946+
{ X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 },
2947+
{ X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 },
2948+
{ X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 },
2949+
{ X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 },
28722950
};
28732951

28742952
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) {

llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,34 @@ define <16 x float> @stack_fold_vpermi2ps(<16 x i32> %x0, <16 x float> %x1, <16
534534
}
535535
declare <16 x float> @llvm.x86.avx512.mask.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
536536

537+
; Stack-folding test for the masked (non-zeroing) 512-bit vpermi2ps form.
; %x0 and %x2 arrive by value; the <16 x i32> operand is passed by pointer
; (%x1) and is only loaded after the inline asm below.
; The inline asm is a "nop" that lists xmm2-xmm31 and the flags as clobbers;
; presumably this forces a live vector value onto the stack, since the
; expected-output pattern requires a 64-byte folded reload from (%rsp) with a
; {%k} write-mask and no {z} suffix.
define <16 x float> @stack_fold_vpermi2ps_mask(<16 x float> %x0, <16 x i32>* %x1, <16 x float> %x2, i16 %mask) {
538+
;CHECK-LABEL: stack_fold_vpermi2ps_mask
539+
;CHECK: vpermi2ps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
540+
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
541+
%x1b = load <16 x i32>, <16 x i32>* %x1
542+
%res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1b, <16 x float> %x2, i16 %mask)
543+
ret <16 x float> %res
544+
}
545+
546+
; Stack-folding test for the masked (non-zeroing) 512-bit vpermt2ps form.
; The <16 x i32> operand is passed by pointer (%x0) and loaded after the
; inline asm; %x1 and %x2 arrive by value.
; The inline asm is a "nop" that lists xmm2-xmm31 and the flags as clobbers;
; presumably this forces a live vector value onto the stack, since the
; expected-output pattern requires a 64-byte folded reload from (%rsp) with a
; {%k} write-mask and no {z} suffix.
define <16 x float> @stack_fold_vpermt2ps_mask(<16 x i32>* %x0, <16 x float> %x1, <16 x float> %x2, i16 %mask) {
547+
;CHECK-LABEL: stack_fold_vpermt2ps_mask
548+
;CHECK: vpermt2ps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
549+
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
550+
%x0b = load <16 x i32>, <16 x i32>* %x0
551+
%res = call <16 x float> @llvm.x86.avx512.mask.vpermt2var.ps.512(<16 x i32> %x0b, <16 x float> %x1, <16 x float> %x2, i16 %mask)
552+
ret <16 x float> %res
553+
}
554+
555+
; Stack-folding test for the zero-masked 512-bit vpermt2ps form; it calls the
; maskz variant of the vpermt2var intrinsic.
; The <16 x i32> operand is passed by pointer (%x0) and loaded after the
; inline asm; %x1 and %x2 arrive by value.
; The inline asm is a "nop" that lists xmm2-xmm31 and the flags as clobbers;
; presumably this forces a live vector value onto the stack, since the
; expected-output pattern requires a 64-byte folded reload from (%rsp) with a
; {%k} write-mask followed by {z}.
define <16 x float> @stack_fold_vpermt2ps_maskz(<16 x i32>* %x0, <16 x float> %x1, <16 x float> %x2, i16 %mask) {
556+
;CHECK-LABEL: stack_fold_vpermt2ps_maskz
557+
;CHECK: vpermt2ps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
558+
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
559+
%x0b = load <16 x i32>, <16 x i32>* %x0
560+
%res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0b, <16 x float> %x1, <16 x float> %x2, i16 %mask)
561+
ret <16 x float> %res
562+
}
563+
declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
564+
537565
define <8 x double> @stack_fold_vpermt2pd(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) {
538566
;CHECK-LABEL: stack_fold_vpermt2pd
539567
;CHECK: vpermt2pd {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload

0 commit comments

Comments
 (0)