Skip to content

Commit f7a034d

Browse files
[AMDGPU] (x or y) xor -1 -> x nor y (#130264)
Added a pattern so that s_nor is selected for ((i1 x or i1 y) xor -1) instead of an s_or followed by an s_xor. This patch covers the divergent i1 case. The ballot in the test is used to retrieve the lane mask. The control flow is needed because the combiner can't look through phi instructions.
1 parent 0d64f5a commit f7a034d

File tree

2 files changed

+121
-0
lines changed

2 files changed

+121
-0
lines changed

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1925,6 +1925,20 @@ def : ScalarNot2Pat<S_ORN2_B32, or, v2i16>;
19251925
def : ScalarNot2Pat<S_ORN2_B64, or, v4i16>;
19261926
def : ScalarNot2Pat<S_ORN2_B64, or, v2i32>;
19271927

1928+
let WaveSizePredicate = isWave32 in {
1929+
def : GCNPat<
1930+
(i1 (not (or_oneuse i1:$src0, i1:$src1))),
1931+
(S_NOR_B32 i1:$src0, i1:$src1)
1932+
>;
1933+
}
1934+
1935+
let WaveSizePredicate = isWave64 in {
1936+
def : GCNPat<
1937+
(i1 (not (or_oneuse i1:$src0, i1:$src1))),
1938+
(S_NOR_B64 i1:$src0, i1:$src1)
1939+
>;
1940+
}
1941+
19281942
//===----------------------------------------------------------------------===//
19291943
// Target-specific instruction encodings.
19301944
//===----------------------------------------------------------------------===//
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck -check-prefixes=SDAG-W64 %s
3+
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="-wavefrontsize32,+wavefrontsize64" -o - < %s | FileCheck -check-prefixes=GISEL-W64 %s
4+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck -check-prefixes=SDAG-W32 %s
5+
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr="+wavefrontsize32,-wavefrontsize64" -o - < %s | FileCheck -check-prefixes=GISEL-W32 %s
6+
7+
; Use ballot for easy access to lanemask
8+
9+
define amdgpu_ps i64 @test_nor(i64 inreg %a, i64 inreg %b) {
10+
; SDAG-W64-LABEL: test_nor:
11+
; SDAG-W64: ; %bb.0:
12+
; SDAG-W64-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
13+
; SDAG-W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14+
; SDAG-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
15+
; SDAG-W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
16+
; SDAG-W64-NEXT: ; return to shader part epilog
17+
;
18+
; GISEL-W64-LABEL: test_nor:
19+
; GISEL-W64: ; %bb.0:
20+
; GISEL-W64-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
21+
; GISEL-W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
22+
; GISEL-W64-NEXT: s_and_b64 s[0:1], s[0:1], exec
23+
; GISEL-W64-NEXT: ; return to shader part epilog
24+
;
25+
; SDAG-W32-LABEL: test_nor:
26+
; SDAG-W32: ; %bb.0:
27+
; SDAG-W32-NEXT: s_nor_b32 s0, s0, s2
28+
; SDAG-W32-NEXT: s_mov_b32 s1, 0
29+
; SDAG-W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
30+
; SDAG-W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
31+
; SDAG-W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
32+
; SDAG-W32-NEXT: ; return to shader part epilog
33+
;
34+
; GISEL-W32-LABEL: test_nor:
35+
; GISEL-W32: ; %bb.0:
36+
; GISEL-W32-NEXT: s_nor_b32 s0, s0, s2
37+
; GISEL-W32-NEXT: s_mov_b32 s1, 0
38+
; GISEL-W32-NEXT: s_and_b32 s0, s0, exec_lo
39+
; GISEL-W32-NEXT: ; return to shader part epilog
40+
%a.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %a)
41+
%b.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %b)
42+
%or = or i1 %a.lanemask, %b.lanemask
43+
%xor = xor i1 %or, true
44+
%r = call i64 @llvm.amdgcn.ballot.i64(i1 %xor)
45+
ret i64 %r
46+
}
47+
48+
define amdgpu_ps i64 @test_or_two_uses(i64 inreg %a, i64 inreg %b) {
49+
; SDAG-W64-LABEL: test_or_two_uses:
50+
; SDAG-W64: ; %bb.0:
51+
; SDAG-W64-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
52+
; SDAG-W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
53+
; SDAG-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
54+
; SDAG-W64-NEXT: s_xor_b64 s[0:1], s[0:1], -1
55+
; SDAG-W64-NEXT: s_waitcnt_depctr 0xfffe
56+
; SDAG-W64-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
57+
; SDAG-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
58+
; SDAG-W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
59+
; SDAG-W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
60+
; SDAG-W64-NEXT: s_and_b64 s[0:1], s[0:1], vcc
61+
; SDAG-W64-NEXT: s_waitcnt_depctr 0xfffe
62+
; SDAG-W64-NEXT: ; return to shader part epilog
63+
;
64+
; GISEL-W64-LABEL: test_or_two_uses:
65+
; GISEL-W64: ; %bb.0:
66+
; GISEL-W64-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
67+
; GISEL-W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
68+
; GISEL-W64-NEXT: s_xor_b64 s[2:3], s[0:1], -1
69+
; GISEL-W64-NEXT: s_and_b64 s[0:1], s[0:1], exec
70+
; GISEL-W64-NEXT: s_and_b64 s[2:3], s[2:3], exec
71+
; GISEL-W64-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
72+
; GISEL-W64-NEXT: ; return to shader part epilog
73+
;
74+
; SDAG-W32-LABEL: test_or_two_uses:
75+
; SDAG-W32: ; %bb.0:
76+
; SDAG-W32-NEXT: s_or_b32 s0, s0, s2
77+
; SDAG-W32-NEXT: s_mov_b32 s3, 0
78+
; SDAG-W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
79+
; SDAG-W32-NEXT: s_xor_b32 s0, s0, -1
80+
; SDAG-W32-NEXT: s_mov_b32 s1, s3
81+
; SDAG-W32-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
82+
; SDAG-W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
83+
; SDAG-W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
84+
; SDAG-W32-NEXT: v_cmp_ne_u32_e64 s2, 0, v1
85+
; SDAG-W32-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
86+
; SDAG-W32-NEXT: ; return to shader part epilog
87+
;
88+
; GISEL-W32-LABEL: test_or_two_uses:
89+
; GISEL-W32: ; %bb.0:
90+
; GISEL-W32-NEXT: s_or_b32 s0, s0, s2
91+
; GISEL-W32-NEXT: s_mov_b32 s1, 0
92+
; GISEL-W32-NEXT: s_xor_b32 s4, s0, -1
93+
; GISEL-W32-NEXT: s_and_b32 s2, s0, exec_lo
94+
; GISEL-W32-NEXT: s_mov_b32 s3, s1
95+
; GISEL-W32-NEXT: s_and_b32 s0, s4, exec_lo
96+
; GISEL-W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
97+
; GISEL-W32-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
98+
; GISEL-W32-NEXT: ; return to shader part epilog
99+
%a.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %a)
100+
%b.lanemask = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %b)
101+
%or = or i1 %a.lanemask, %b.lanemask
102+
%xor = xor i1 %or, true
103+
%r0 = call i64 @llvm.amdgcn.ballot.i64(i1 %xor)
104+
%r1 = call i64 @llvm.amdgcn.ballot.i64(i1 %or)
105+
%r = and i64 %r0, %r1
106+
ret i64 %r
107+
}

0 commit comments

Comments
 (0)