Skip to content

Commit ed88297

Browse files
committed
[SKX] Extended non-temporal load/store instructions for AVX512VL subsets.
Added the avx512_movnt_vl multiclass to handle the 256-bit and 128-bit forms of the instructions. Added encoding and lowering tests.
Reviewed by Elena Demikhovsky <[email protected]>
llvm-svn: 215536
1 parent d97a634 commit ed88297

File tree

7 files changed: +391 -36 lines changed

llvm/include/llvm/IR/IntrinsicsX86.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1954,8 +1954,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
19541954
llvm_i32_ty], [IntrNoMem, Commutative]>;
19551955
def int_x86_avx2_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa256">,
19561956
Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
1957-
def int_x86_avx512_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa512">,
1958-
Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
19591957
}
19601958

19611959
//===----------------------------------------------------------------------===//
@@ -3219,6 +3217,8 @@ let TargetPrefix = "x86" in {
32193217
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
32203218
llvm_v8i64_ty, llvm_i8_ty],
32213219
[IntrNoMem]>;
3220+
def int_x86_avx512_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa512">,
3221+
Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
32223222
}
32233223

32243224
//===----------------------------------------------------------------------===//

llvm/lib/Target/X86/X86InstrAVX512.td

Lines changed: 64 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2090,43 +2090,73 @@ def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
20902090
//===----------------------------------------------------------------------===//
20912091
// AVX-512 - Non-temporals
20922092
//===----------------------------------------------------------------------===//
2093+
let SchedRW = [WriteLoad] in {
2094+
def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
2095+
(ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
2096+
[(set VR512:$dst, (int_x86_avx512_movntdqa addr:$src))],
2097+
SSEPackedInt>, EVEX, T8PD, EVEX_V512,
2098+
EVEX_CD8<64, CD8VF>;
2099+
2100+
let Predicates = [HasAVX512, HasVLX] in {
2101+
def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
2102+
(ins i256mem:$src),
2103+
"vmovntdqa\t{$src, $dst|$dst, $src}", [],
2104+
SSEPackedInt>, EVEX, T8PD, EVEX_V256,
2105+
EVEX_CD8<64, CD8VF>;
2106+
2107+
def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
2108+
(ins i128mem:$src),
2109+
"vmovntdqa\t{$src, $dst|$dst, $src}", [],
2110+
SSEPackedInt>, EVEX, T8PD, EVEX_V128,
2111+
EVEX_CD8<64, CD8VF>;
2112+
}
2113+
}
20932114

2094-
def VMOVNTDQAZrm : AVX5128I<0x2A, MRMSrcMem, (outs VR512:$dst),
2095-
(ins i512mem:$src),
2096-
"vmovntdqa\t{$src, $dst|$dst, $src}",
2097-
[(set VR512:$dst,
2098-
(int_x86_avx512_movntdqa addr:$src))]>,
2099-
EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
2100-
2101-
// Prefer non-temporal over temporal versions
2102-
let AddedComplexity = 400, SchedRW = [WriteStore] in {
2103-
2104-
def VMOVNTPSZmr : AVX512PSI<0x2B, MRMDestMem, (outs),
2105-
(ins f512mem:$dst, VR512:$src),
2106-
"vmovntps\t{$src, $dst|$dst, $src}",
2107-
[(alignednontemporalstore (v16f32 VR512:$src),
2108-
addr:$dst)],
2109-
IIC_SSE_MOVNT>,
2110-
EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
2111-
2112-
def VMOVNTPDZmr : AVX512PDI<0x2B, MRMDestMem, (outs),
2113-
(ins f512mem:$dst, VR512:$src),
2114-
"vmovntpd\t{$src, $dst|$dst, $src}",
2115-
[(alignednontemporalstore (v8f64 VR512:$src),
2116-
addr:$dst)],
2117-
IIC_SSE_MOVNT>,
2118-
EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
2119-
2120-
2121-
def VMOVNTDQZmr : AVX512BI<0xE7, MRMDestMem, (outs),
2122-
(ins i512mem:$dst, VR512:$src),
2123-
"vmovntdq\t{$src, $dst|$dst, $src}",
2124-
[(alignednontemporalstore (v8i64 VR512:$src),
2125-
addr:$dst)],
2126-
IIC_SSE_MOVNT>,
2127-
EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
2115+
multiclass avx512_movnt<bits<8> opc, string OpcodeStr, PatFrag st_frag,
2116+
ValueType OpVT, RegisterClass RC, X86MemOperand memop,
2117+
Domain d, InstrItinClass itin = IIC_SSE_MOVNT> {
2118+
let SchedRW = [WriteStore], mayStore = 1,
2119+
AddedComplexity = 400 in
2120+
def mr : AVX512PI<opc, MRMDestMem, (outs), (ins memop:$dst, RC:$src),
2121+
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
2122+
[(st_frag (OpVT RC:$src), addr:$dst)], d, itin>, EVEX;
21282123
}
21292124

2125+
multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr, PatFrag st_frag,
2126+
string elty, string elsz, string vsz512,
2127+
string vsz256, string vsz128, Domain d,
2128+
Predicate prd, InstrItinClass itin = IIC_SSE_MOVNT> {
2129+
let Predicates = [prd] in
2130+
defm Z : avx512_movnt<opc, OpcodeStr, st_frag,
2131+
!cast<ValueType>("v"##vsz512##elty##elsz), VR512,
2132+
!cast<X86MemOperand>(elty##"512mem"), d, itin>,
2133+
EVEX_V512;
2134+
2135+
let Predicates = [prd, HasVLX] in {
2136+
defm Z256 : avx512_movnt<opc, OpcodeStr, st_frag,
2137+
!cast<ValueType>("v"##vsz256##elty##elsz), VR256X,
2138+
!cast<X86MemOperand>(elty##"256mem"), d, itin>,
2139+
EVEX_V256;
2140+
2141+
defm Z128 : avx512_movnt<opc, OpcodeStr, st_frag,
2142+
!cast<ValueType>("v"##vsz128##elty##elsz), VR128X,
2143+
!cast<X86MemOperand>(elty##"128mem"), d, itin>,
2144+
EVEX_V128;
2145+
}
2146+
}
2147+
2148+
defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", alignednontemporalstore,
2149+
"i", "64", "8", "4", "2", SSEPackedInt,
2150+
HasAVX512>, PD, EVEX_CD8<64, CD8VF>;
2151+
2152+
defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", alignednontemporalstore,
2153+
"f", "64", "8", "4", "2", SSEPackedDouble,
2154+
HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>;
2155+
2156+
defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", alignednontemporalstore,
2157+
"f", "32", "16", "8", "4", SSEPackedSingle,
2158+
HasAVX512>, PS, EVEX_CD8<32, CD8VF>;
2159+
21302160
//===----------------------------------------------------------------------===//
21312161
// AVX-512 - Integer arithmetic
21322162
//

llvm/lib/Target/X86/X86InstrInfo.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -727,6 +727,7 @@ def HasDQI : Predicate<"Subtarget->hasDQI()">;
727727
def HasBWI : Predicate<"Subtarget->hasBWI()">;
728728
def HasVLX : Predicate<"Subtarget->hasVLX()">,
729729
AssemblerPredicate<"FeatureVLX", "AVX-512 VLX ISA">;
730+
def NoVLX : Predicate<"!Subtarget->hasVLX()">;
730731

731732
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
732733
def HasAES : Predicate<"Subtarget->hasAES()">;

llvm/lib/Target/X86/X86InstrSSE.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3697,6 +3697,7 @@ let Predicates = [UseSSE1] in {
36973697

36983698
let AddedComplexity = 400 in { // Prefer non-temporal versions
36993699
let SchedRW = [WriteStore] in {
3700+
let Predicates = [HasAVX, NoVLX] in {
37003701
def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
37013702
(ins f128mem:$dst, VR128:$src),
37023703
"movntps\t{$src, $dst|$dst, $src}",
@@ -3737,6 +3738,7 @@ def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
37373738
[(alignednontemporalstore (v4i64 VR256:$src),
37383739
addr:$dst)],
37393740
IIC_SSE_MOVNT>, VEX, VEX_L;
3741+
}
37403742

37413743
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
37423744
"movntps\t{$src, $dst|$dst, $src}",
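[file path missing from this capture — a newly added llc/FileCheck lowering test, per the RUN line below]

Lines changed: 34 additions & 0 deletions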
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding | FileCheck %s
2+
3+
define void @f256(<8 x float> %A, <8 x float> %AA, i8* %B, <4 x double> %C, <4 x double> %CC, i32 %D, <4 x i64> %E, <4 x i64> %EE) {
4+
; CHECK: vmovntps %ymm{{.*}} ## encoding: [0x62
5+
%cast = bitcast i8* %B to <8 x float>*
6+
%A2 = fadd <8 x float> %A, %AA
7+
store <8 x float> %A2, <8 x float>* %cast, align 64, !nontemporal !0
8+
; CHECK: vmovntdq %ymm{{.*}} ## encoding: [0x62
9+
%cast1 = bitcast i8* %B to <4 x i64>*
10+
%E2 = add <4 x i64> %E, %EE
11+
store <4 x i64> %E2, <4 x i64>* %cast1, align 64, !nontemporal !0
12+
; CHECK: vmovntpd %ymm{{.*}} ## encoding: [0x62
13+
%cast2 = bitcast i8* %B to <4 x double>*
14+
%C2 = fadd <4 x double> %C, %CC
15+
store <4 x double> %C2, <4 x double>* %cast2, align 64, !nontemporal !0
16+
ret void
17+
}
18+
19+
define void @f128(<4 x float> %A, <4 x float> %AA, i8* %B, <2 x double> %C, <2 x double> %CC, i32 %D, <2 x i64> %E, <2 x i64> %EE) {
20+
; CHECK: vmovntps %xmm{{.*}} ## encoding: [0x62
21+
%cast = bitcast i8* %B to <4 x float>*
22+
%A2 = fadd <4 x float> %A, %AA
23+
store <4 x float> %A2, <4 x float>* %cast, align 64, !nontemporal !0
24+
; CHECK: vmovntdq %xmm{{.*}} ## encoding: [0x62
25+
%cast1 = bitcast i8* %B to <2 x i64>*
26+
%E2 = add <2 x i64> %E, %EE
27+
store <2 x i64> %E2, <2 x i64>* %cast1, align 64, !nontemporal !0
28+
; CHECK: vmovntpd %xmm{{.*}} ## encoding: [0x62
29+
%cast2 = bitcast i8* %B to <2 x double>*
30+
%C2 = fadd <2 x double> %C, %CC
31+
store <2 x double> %C2, <2 x double>* %cast2, align 64, !nontemporal !0
32+
ret void
33+
}
34+
!0 = metadata !{i32 1}

llvm/test/MC/X86/avx512-encodings.s

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -665,6 +665,102 @@
665665
// CHECK: encoding: [0x62,0xf1,0xfe,0x48,0x6f,0xb2,0xc0,0xdf,0xff,0xff]
666666
vmovdqu64 -8256(%rdx), %zmm6
667667

668+
// CHECK: vmovntdq %zmm24, (%rcx)
669+
// CHECK: encoding: [0x62,0x61,0x7d,0x48,0xe7,0x01]
670+
vmovntdq %zmm24, (%rcx)
671+
672+
// CHECK: vmovntdq %zmm24, 291(%rax,%r14,8)
673+
// CHECK: encoding: [0x62,0x21,0x7d,0x48,0xe7,0x84,0xf0,0x23,0x01,0x00,0x00]
674+
vmovntdq %zmm24, 291(%rax,%r14,8)
675+
676+
// CHECK: vmovntdq %zmm24, 8128(%rdx)
677+
// CHECK: encoding: [0x62,0x61,0x7d,0x48,0xe7,0x42,0x7f]
678+
vmovntdq %zmm24, 8128(%rdx)
679+
680+
// CHECK: vmovntdq %zmm24, 8192(%rdx)
681+
// CHECK: encoding: [0x62,0x61,0x7d,0x48,0xe7,0x82,0x00,0x20,0x00,0x00]
682+
vmovntdq %zmm24, 8192(%rdx)
683+
684+
// CHECK: vmovntdq %zmm24, -8192(%rdx)
685+
// CHECK: encoding: [0x62,0x61,0x7d,0x48,0xe7,0x42,0x80]
686+
vmovntdq %zmm24, -8192(%rdx)
687+
688+
// CHECK: vmovntdq %zmm24, -8256(%rdx)
689+
// CHECK: encoding: [0x62,0x61,0x7d,0x48,0xe7,0x82,0xc0,0xdf,0xff,0xff]
690+
vmovntdq %zmm24, -8256(%rdx)
691+
692+
// CHECK: vmovntdqa (%rcx), %zmm17
693+
// CHECK: encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x09]
694+
vmovntdqa (%rcx), %zmm17
695+
696+
// CHECK: vmovntdqa 291(%rax,%r14,8), %zmm17
697+
// CHECK: encoding: [0x62,0xa2,0x7d,0x48,0x2a,0x8c,0xf0,0x23,0x01,0x00,0x00]
698+
vmovntdqa 291(%rax,%r14,8), %zmm17
699+
700+
// CHECK: vmovntdqa 8128(%rdx), %zmm17
701+
// CHECK: encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x4a,0x7f]
702+
vmovntdqa 8128(%rdx), %zmm17
703+
704+
// CHECK: vmovntdqa 8192(%rdx), %zmm17
705+
// CHECK: encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x8a,0x00,0x20,0x00,0x00]
706+
vmovntdqa 8192(%rdx), %zmm17
707+
708+
// CHECK: vmovntdqa -8192(%rdx), %zmm17
709+
// CHECK: encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x4a,0x80]
710+
vmovntdqa -8192(%rdx), %zmm17
711+
712+
// CHECK: vmovntdqa -8256(%rdx), %zmm17
713+
// CHECK: encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x8a,0xc0,0xdf,0xff,0xff]
714+
vmovntdqa -8256(%rdx), %zmm17
715+
716+
// CHECK: vmovntpd %zmm17, (%rcx)
717+
// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x09]
718+
vmovntpd %zmm17, (%rcx)
719+
720+
// CHECK: vmovntpd %zmm17, 291(%rax,%r14,8)
721+
// CHECK: encoding: [0x62,0xa1,0xfd,0x48,0x2b,0x8c,0xf0,0x23,0x01,0x00,0x00]
722+
vmovntpd %zmm17, 291(%rax,%r14,8)
723+
724+
// CHECK: vmovntpd %zmm17, 8128(%rdx)
725+
// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x4a,0x7f]
726+
vmovntpd %zmm17, 8128(%rdx)
727+
728+
// CHECK: vmovntpd %zmm17, 8192(%rdx)
729+
// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x8a,0x00,0x20,0x00,0x00]
730+
vmovntpd %zmm17, 8192(%rdx)
731+
732+
// CHECK: vmovntpd %zmm17, -8192(%rdx)
733+
// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x4a,0x80]
734+
vmovntpd %zmm17, -8192(%rdx)
735+
736+
// CHECK: vmovntpd %zmm17, -8256(%rdx)
737+
// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x8a,0xc0,0xdf,0xff,0xff]
738+
vmovntpd %zmm17, -8256(%rdx)
739+
740+
// CHECK: vmovntps %zmm5, (%rcx)
741+
// CHECK: encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x29]
742+
vmovntps %zmm5, (%rcx)
743+
744+
// CHECK: vmovntps %zmm5, 291(%rax,%r14,8)
745+
// CHECK: encoding: [0x62,0xb1,0x7c,0x48,0x2b,0xac,0xf0,0x23,0x01,0x00,0x00]
746+
vmovntps %zmm5, 291(%rax,%r14,8)
747+
748+
// CHECK: vmovntps %zmm5, 8128(%rdx)
749+
// CHECK: encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x6a,0x7f]
750+
vmovntps %zmm5, 8128(%rdx)
751+
752+
// CHECK: vmovntps %zmm5, 8192(%rdx)
753+
// CHECK: encoding: [0x62,0xf1,0x7c,0x48,0x2b,0xaa,0x00,0x20,0x00,0x00]
754+
vmovntps %zmm5, 8192(%rdx)
755+
756+
// CHECK: vmovntps %zmm5, -8192(%rdx)
757+
// CHECK: encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x6a,0x80]
758+
vmovntps %zmm5, -8192(%rdx)
759+
760+
// CHECK: vmovntps %zmm5, -8256(%rdx)
761+
// CHECK: encoding: [0x62,0xf1,0x7c,0x48,0x2b,0xaa,0xc0,0xdf,0xff,0xff]
762+
vmovntps %zmm5, -8256(%rdx)
763+
668764
// CHECK: vmovupd %zmm9, %zmm27
669765
// CHECK: encoding: [0x62,0x41,0xfd,0x48,0x10,0xd9]
670766
vmovupd %zmm9, %zmm27

0 commit comments

Comments
 (0)