Skip to content

Commit 254cdcd

Browse files
authored
[X86] ptest is commutable as long as only the Z flag is used. (#88969)
Fixes #88958.
1 parent efbb846 commit 254cdcd

File tree

5 files changed

+222
-17
lines changed

5 files changed

+222
-17
lines changed

llvm/lib/Target/X86/X86InstrSSE.td

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5688,6 +5688,13 @@ let Predicates = [UseSSE41, OptForSize] in {
56885688
// SSE4.1 - Packed Bit Test
56895689
//===----------------------------------------------------------------------===//
56905690

5691+
// ptest is commutable if only the Z flag is used. If the C flag is used,
5692+
// commuting would change which operand is inverted.
5693+
def X86ptest_commutable : PatFrag<(ops node:$src1, node:$src2),
5694+
(X86ptest node:$src1, node:$src2), [{
5695+
return onlyUsesZeroFlag(SDValue(Node, 0));
5696+
}]>;
5697+
56915698
// ptest instruction we'll lower to this in X86ISelLowering primarily from
56925699
// the Intel intrinsic that corresponds to this.
56935700
let Defs = [EFLAGS], Predicates = [HasAVX] in {
@@ -5723,6 +5730,17 @@ def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
57235730
Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
57245731
}
57255732

5733+
let Predicates = [HasAVX] in {
5734+
def : Pat<(X86ptest_commutable (loadv2i64 addr:$src2), VR128:$src1),
5735+
(VPTESTrm VR128:$src1, addr:$src2)>;
5736+
def : Pat<(X86ptest_commutable (loadv4i64 addr:$src2), VR256:$src1),
5737+
(VPTESTYrm VR256:$src1, addr:$src2)>;
5738+
}
5739+
let Predicates = [UseSSE41] in {
5740+
def : Pat<(X86ptest_commutable (memopv2i64 addr:$src2), VR128:$src1),
5741+
(PTESTrm VR128:$src1, addr:$src2)>;
5742+
}
5743+
57265744
// The bit test instructions below are AVX only
57275745
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
57285746
X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
@@ -5737,6 +5755,13 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
57375755
Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
57385756
}
57395757

5758+
// testps/testpd are commutable if only the Z flag is used. If the C flag is
5759+
// used, commuting would change which operand is inverted.
5760+
def X86testp_commutable : PatFrag<(ops node:$src1, node:$src2),
5761+
(X86testp node:$src1, node:$src2), [{
5762+
return onlyUsesZeroFlag(SDValue(Node, 0));
5763+
}]>;
5764+
57405765
let Defs = [EFLAGS], Predicates = [HasAVX] in {
57415766
let ExeDomain = SSEPackedSingle in {
57425767
defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
@@ -5752,6 +5777,18 @@ defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
57525777
}
57535778
}
57545779

5780+
let Predicates = [HasAVX] in {
5781+
def : Pat<(X86testp_commutable (loadv4f32 addr:$src2), VR128:$src),
5782+
(VTESTPSrm VR128:$src, addr:$src2)>;
5783+
def : Pat<(X86testp_commutable (loadv8f32 addr:$src2), VR256:$src),
5784+
(VTESTPSYrm VR256:$src, addr:$src2)>;
5785+
5786+
def : Pat<(X86testp_commutable (loadv2f64 addr:$src2), VR128:$src),
5787+
(VTESTPDrm VR128:$src, addr:$src2)>;
5788+
def : Pat<(X86testp_commutable (loadv4f64 addr:$src2), VR256:$src),
5789+
(VTESTPDYrm VR256:$src, addr:$src2)>;
5790+
}
5791+
57555792
//===----------------------------------------------------------------------===//
57565793
// SSE4.1 - Misc Instructions
57575794
//===----------------------------------------------------------------------===//

llvm/test/CodeGen/X86/combine-ptest.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -400,17 +400,15 @@ define i1 @PR38788(<4 x i32> %0, <4 x i32> %1) {
400400
define i32 @PR88958_1(ptr %0, <2 x i64> %1) {
401401
; SSE-LABEL: PR88958_1:
402402
; SSE: # %bb.0:
403-
; SSE-NEXT: movdqa (%rdi), %xmm1
404403
; SSE-NEXT: xorl %eax, %eax
405-
; SSE-NEXT: ptest %xmm0, %xmm1
404+
; SSE-NEXT: ptest (%rdi), %xmm0
406405
; SSE-NEXT: sete %al
407406
; SSE-NEXT: retq
408407
;
409408
; AVX-LABEL: PR88958_1:
410409
; AVX: # %bb.0:
411-
; AVX-NEXT: vmovdqa (%rdi), %xmm1
412410
; AVX-NEXT: xorl %eax, %eax
413-
; AVX-NEXT: vptest %xmm0, %xmm1
411+
; AVX-NEXT: vptest (%rdi), %xmm0
414412
; AVX-NEXT: sete %al
415413
; AVX-NEXT: retq
416414
%3 = load <2 x i64>, ptr %0

llvm/test/CodeGen/X86/combine-testpd.ll

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,88 @@ end: ; preds = %entry
255255
ret void
256256
}
257257

258+
define i32 @PR88958_1(ptr %0, <2 x double> %1) {
259+
; SSE-LABEL: PR88958_1:
260+
; SSE: # %bb.0:
261+
; SSE-NEXT: xorl %eax, %eax
262+
; SSE-NEXT: ptest (%rdi), %xmm0
263+
; SSE-NEXT: sete %al
264+
; SSE-NEXT: retq
265+
;
266+
; CHECK-LABEL: PR88958_1:
267+
; CHECK: # %bb.0:
268+
; CHECK-NEXT: xorl %eax, %eax
269+
; CHECK-NEXT: vtestpd (%rdi), %xmm0
270+
; CHECK-NEXT: sete %al
271+
; CHECK-NEXT: retq
272+
%3 = load <2 x double>, ptr %0
273+
%4 = tail call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %3, <2 x double> %1)
274+
ret i32 %4
275+
}
276+
277+
define i32 @PR88958_2(ptr %0, <2 x double> %1) {
278+
; SSE-LABEL: PR88958_2:
279+
; SSE: # %bb.0:
280+
; SSE-NEXT: movdqa (%rdi), %xmm1
281+
; SSE-NEXT: xorl %eax, %eax
282+
; SSE-NEXT: ptest %xmm0, %xmm1
283+
; SSE-NEXT: setb %al
284+
; SSE-NEXT: retq
285+
;
286+
; CHECK-LABEL: PR88958_2:
287+
; CHECK: # %bb.0:
288+
; CHECK-NEXT: vmovapd (%rdi), %xmm1
289+
; CHECK-NEXT: xorl %eax, %eax
290+
; CHECK-NEXT: vtestpd %xmm0, %xmm1
291+
; CHECK-NEXT: setb %al
292+
; CHECK-NEXT: retq
293+
%3 = load <2 x double>, ptr %0
294+
%4 = tail call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %3, <2 x double> %1)
295+
ret i32 %4
296+
}
297+
298+
define i32 @PR88958_3(ptr %0, <4 x double> %1) {
299+
; SSE-LABEL: PR88958_3:
300+
; SSE: # %bb.0:
301+
; SSE-NEXT: xorl %eax, %eax
302+
; SSE-NEXT: ptest (%rdi), %xmm0
303+
; SSE-NEXT: sete %al
304+
; SSE-NEXT: retq
305+
;
306+
; CHECK-LABEL: PR88958_3:
307+
; CHECK: # %bb.0:
308+
; CHECK-NEXT: xorl %eax, %eax
309+
; CHECK-NEXT: vtestpd (%rdi), %ymm0
310+
; CHECK-NEXT: sete %al
311+
; CHECK-NEXT: vzeroupper
312+
; CHECK-NEXT: retq
313+
%3 = load <4 x double>, ptr %0
314+
%4 = tail call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %3, <4 x double> %1)
315+
ret i32 %4
316+
}
317+
318+
define i32 @PR88958_4(ptr %0, <4 x double> %1) {
319+
; SSE-LABEL: PR88958_4:
320+
; SSE: # %bb.0:
321+
; SSE-NEXT: movdqa (%rdi), %xmm1
322+
; SSE-NEXT: xorl %eax, %eax
323+
; SSE-NEXT: ptest %xmm0, %xmm1
324+
; SSE-NEXT: setb %al
325+
; SSE-NEXT: retq
326+
;
327+
; CHECK-LABEL: PR88958_4:
328+
; CHECK: # %bb.0:
329+
; CHECK-NEXT: vmovapd (%rdi), %ymm1
330+
; CHECK-NEXT: xorl %eax, %eax
331+
; CHECK-NEXT: vtestpd %ymm0, %ymm1
332+
; CHECK-NEXT: setb %al
333+
; CHECK-NEXT: vzeroupper
334+
; CHECK-NEXT: retq
335+
%3 = load <4 x double>, ptr %0
336+
%4 = tail call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %3, <4 x double> %1)
337+
ret i32 %4
338+
}
339+
258340
declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnone
259341
declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone
260342
declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readnone

llvm/test/CodeGen/X86/combine-testps.ll

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,88 @@ end: ; preds = %entry
254254
ret void
255255
}
256256

257+
define i32 @PR88958_1(ptr %0, <4 x float> %1) {
258+
; SSE-LABEL: PR88958_1:
259+
; SSE: # %bb.0:
260+
; SSE-NEXT: xorl %eax, %eax
261+
; SSE-NEXT: ptest (%rdi), %xmm0
262+
; SSE-NEXT: sete %al
263+
; SSE-NEXT: retq
264+
;
265+
; CHECK-LABEL: PR88958_1:
266+
; CHECK: # %bb.0:
267+
; CHECK-NEXT: xorl %eax, %eax
268+
; CHECK-NEXT: vtestps (%rdi), %xmm0
269+
; CHECK-NEXT: sete %al
270+
; CHECK-NEXT: retq
271+
%3 = load <4 x float>, ptr %0
272+
%4 = tail call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %3, <4 x float> %1)
273+
ret i32 %4
274+
}
275+
276+
define i32 @PR88958_2(ptr %0, <4 x float> %1) {
277+
; SSE-LABEL: PR88958_2:
278+
; SSE: # %bb.0:
279+
; SSE-NEXT: movdqa (%rdi), %xmm1
280+
; SSE-NEXT: xorl %eax, %eax
281+
; SSE-NEXT: ptest %xmm0, %xmm1
282+
; SSE-NEXT: setb %al
283+
; SSE-NEXT: retq
284+
;
285+
; CHECK-LABEL: PR88958_2:
286+
; CHECK: # %bb.0:
287+
; CHECK-NEXT: vmovaps (%rdi), %xmm1
288+
; CHECK-NEXT: xorl %eax, %eax
289+
; CHECK-NEXT: vtestps %xmm0, %xmm1
290+
; CHECK-NEXT: setb %al
291+
; CHECK-NEXT: retq
292+
%3 = load <4 x float>, ptr %0
293+
%4 = tail call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %3, <4 x float> %1)
294+
ret i32 %4
295+
}
296+
297+
define i32 @PR88958_3(ptr %0, <8 x float> %1) {
298+
; SSE-LABEL: PR88958_3:
299+
; SSE: # %bb.0:
300+
; SSE-NEXT: xorl %eax, %eax
301+
; SSE-NEXT: ptest (%rdi), %xmm0
302+
; SSE-NEXT: sete %al
303+
; SSE-NEXT: retq
304+
;
305+
; CHECK-LABEL: PR88958_3:
306+
; CHECK: # %bb.0:
307+
; CHECK-NEXT: xorl %eax, %eax
308+
; CHECK-NEXT: vtestps (%rdi), %ymm0
309+
; CHECK-NEXT: sete %al
310+
; CHECK-NEXT: vzeroupper
311+
; CHECK-NEXT: retq
312+
%3 = load <8 x float>, ptr %0
313+
%4 = tail call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %3, <8 x float> %1)
314+
ret i32 %4
315+
}
316+
317+
define i32 @PR88958_4(ptr %0, <8 x float> %1) {
318+
; SSE-LABEL: PR88958_4:
319+
; SSE: # %bb.0:
320+
; SSE-NEXT: movdqa (%rdi), %xmm1
321+
; SSE-NEXT: xorl %eax, %eax
322+
; SSE-NEXT: ptest %xmm0, %xmm1
323+
; SSE-NEXT: setb %al
324+
; SSE-NEXT: retq
325+
;
326+
; CHECK-LABEL: PR88958_4:
327+
; CHECK: # %bb.0:
328+
; CHECK-NEXT: vmovaps (%rdi), %ymm1
329+
; CHECK-NEXT: xorl %eax, %eax
330+
; CHECK-NEXT: vtestps %ymm0, %ymm1
331+
; CHECK-NEXT: setb %al
332+
; CHECK-NEXT: vzeroupper
333+
; CHECK-NEXT: retq
334+
%3 = load <8 x float>, ptr %0
335+
%4 = tail call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %3, <8 x float> %1)
336+
ret i32 %4
337+
}
338+
257339
declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone
258340
declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
259341
declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnone

llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,32 +1018,38 @@ define zeroext i1 @PR44781(ptr %0) {
10181018
; SSE41-NEXT: sete %al
10191019
; SSE41-NEXT: retq
10201020
;
1021-
; AVX1OR2-LABEL: PR44781:
1022-
; AVX1OR2: # %bb.0:
1023-
; AVX1OR2-NEXT: vmovdqu (%rdi), %xmm0
1024-
; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1025-
; AVX1OR2-NEXT: sete %al
1026-
; AVX1OR2-NEXT: retq
1021+
; AVX1-LABEL: PR44781:
1022+
; AVX1: # %bb.0:
1023+
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [15,15,15,15]
1024+
; AVX1-NEXT: vptest (%rdi), %xmm0
1025+
; AVX1-NEXT: sete %al
1026+
; AVX1-NEXT: retq
1027+
;
1028+
; AVX2-LABEL: PR44781:
1029+
; AVX2: # %bb.0:
1030+
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15]
1031+
; AVX2-NEXT: vptest (%rdi), %xmm0
1032+
; AVX2-NEXT: sete %al
1033+
; AVX2-NEXT: retq
10271034
;
10281035
; AVX512F-LABEL: PR44781:
10291036
; AVX512F: # %bb.0:
1030-
; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
1031-
; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1037+
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15]
1038+
; AVX512F-NEXT: vptest (%rdi), %xmm0
10321039
; AVX512F-NEXT: sete %al
10331040
; AVX512F-NEXT: retq
10341041
;
10351042
; AVX512BW-LABEL: PR44781:
10361043
; AVX512BW: # %bb.0:
1037-
; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
1038-
; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1044+
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [15,15,15,15]
1045+
; AVX512BW-NEXT: vptest (%rdi), %xmm0
10391046
; AVX512BW-NEXT: sete %al
10401047
; AVX512BW-NEXT: retq
10411048
;
10421049
; AVX512BWVL-LABEL: PR44781:
10431050
; AVX512BWVL: # %bb.0:
1044-
; AVX512BWVL-NEXT: vmovdqu (%rdi), %xmm0
1045-
; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64424509455,64424509455]
1046-
; AVX512BWVL-NEXT: vptest %xmm1, %xmm0
1051+
; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [64424509455,64424509455]
1052+
; AVX512BWVL-NEXT: vptest (%rdi), %xmm0
10471053
; AVX512BWVL-NEXT: sete %al
10481054
; AVX512BWVL-NEXT: retq
10491055
%2 = load <4 x i32>, ptr %0, align 4

0 commit comments

Comments
 (0)