Commit a728d63

AMDGPU: Add test showing bit operations that should be reducible
v_xor_i64_known_i32_from_range_use_out_of_block demonstrates a regression that appears with a future patch to the IR division expansion. We could generalize splitBinaryBitConstantOp to use known bits. I'm not sure it is worth it for the original example, since the pattern seems to disappear if I optimize the division expansion. We should probably fix the divide expansion to avoid this.
1 parent: ab954b1
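
For illustration, the reduction the first test is asking for can be written out by hand in IR: the range on %arg0 proves its high 32 bits are zero, so only the low halves need a real 32-bit xor, and the high half of %arg1 passes through unchanged. The following is a hand-written sketch of that target shape (the function name is made up for illustration); it is not the output of this commit or of any existing pass.

; Sketch only: hand-reduced form of v_xor_i64_known_hi_i32_from_arg_range.
; The range attribute makes the high 32 bits of %arg0 known zero, so the
; xor of the high halves is just a copy of %arg1's high half.
define i64 @xor_i64_hi_known_zero_sketch(i64 range(i64 0, 4294967296) %arg0, i64 %arg1) {
  %lo0 = trunc i64 %arg0 to i32
  %lo1 = trunc i64 %arg1 to i32
  %xor.lo = xor i32 %lo0, %lo1          ; the only 32-bit operation actually required
  %lo.ext = zext i32 %xor.lo to i64
  %hi1 = and i64 %arg1, -4294967296     ; keep %arg1's high 32 bits unchanged
  %res = or i64 %lo.ext, %hi1
  ret i64 %res
}

At the ISA level this corresponds to a single v_xor_b32_e32 on the low registers plus a plain copy of the high register, rather than the pair of v_xor_b32_e32 instructions the test currently checks for.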

1 file changed: 162 additions, 0 deletions

@@ -0,0 +1,162 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s

; Check for situations where we could reduce the width of bitwise
; operations.


; Should be able to reduce this to a 32-bit xor plus a copy
; https://alive2.llvm.org/ce/z/9LddFX
define i64 @v_xor_i64_known_hi_i32_from_arg_range(i64 range(i64 0, 4294967296) %arg0, i64 %arg1) {
; CHECK-LABEL: v_xor_i64_known_hi_i32_from_arg_range:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %xor = xor i64 %arg0, %arg1
  ret i64 %xor
}

; Should be able to reduce this to a 32-bit or plus a copy
; https://alive2.llvm.org/ce/z/HaXnBJ
define i64 @v_or_i64_known_hi_i32_from_arg_range(i64 range(i64 0, 4294967296) %arg0, i64 %arg1) {
; CHECK-LABEL: v_or_i64_known_hi_i32_from_arg_range:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_or_b32_e32 v1, v1, v3
; CHECK-NEXT:    v_or_b32_e32 v0, v0, v2
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %or = or i64 %arg0, %arg1
  ret i64 %or
}

; https://alive2.llvm.org/ce/z/M96Ror
; Should be able to reduce this to a 32-bit and plus a copy
define i64 @v_and_i64_known_i32_from_arg_range(i64 range(i64 -4294967296, 0) %arg0, i64 %arg1) {
; CHECK-LABEL: v_and_i64_known_i32_from_arg_range:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_and_b32_e32 v1, v1, v3
; CHECK-NEXT:    v_and_b32_e32 v0, v0, v2
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %and = and i64 %arg0, %arg1
  ret i64 %and
}

define i64 @s_xor_i64_known_i32_from_arg_range(i64 range(i64 0, 65) inreg %arg) {
; CHECK-LABEL: s_xor_i64_known_i32_from_arg_range:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_not_b64 s[4:5], s[16:17]
; CHECK-NEXT:    v_mov_b32_e32 v0, s4
; CHECK-NEXT:    v_mov_b32_e32 v1, s5
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %xor = xor i64 %arg, -1
  ret i64 %xor
}

define i64 @v_xor_i64_known_i32_from_call_range() {
; CHECK-LABEL: v_xor_i64_known_i32_from_call_range:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ;;#ASMSTART
; CHECK-NEXT:    ; def v[0:1]
; CHECK-NEXT:    ;;#ASMEND
; CHECK-NEXT:    v_not_b32_e32 v1, v1
; CHECK-NEXT:    v_not_b32_e32 v0, v0
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %call = call range(i64 0, 65) i64 asm "; def $0", "=v"()
  %xor = xor i64 %call, -1
  ret i64 %xor
}

define i64 @s_xor_i64_known_i32_from_call_range() {
; CHECK-LABEL: s_xor_i64_known_i32_from_call_range:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ;;#ASMSTART
; CHECK-NEXT:    ; def s[4:5]
; CHECK-NEXT:    ;;#ASMEND
; CHECK-NEXT:    s_not_b64 s[4:5], s[4:5]
; CHECK-NEXT:    v_mov_b32_e32 v0, s4
; CHECK-NEXT:    v_mov_b32_e32 v1, s5
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %call = call range(i64 0, 65) i64 asm "; def $0", "=s"()
  %xor = xor i64 %call, -1
  ret i64 %xor
}

; Reduced from -amdgpu-codegenprepare-expand-div64 output, produces a
; not of 0 which ideally would fold out.
; FIXME: Produces not of constant 0
define i64 @v_xor_i64_known_i32_from_range_use_out_of_block(i64 %x) {
; CHECK-LABEL: v_xor_i64_known_i32_from_range_use_out_of_block:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_ffbh_u32_e32 v2, v0
; CHECK-NEXT:    v_add_u32_e32 v2, 32, v2
; CHECK-NEXT:    v_ffbh_u32_e32 v3, v1
; CHECK-NEXT:    v_min_u32_e32 v4, v2, v3
; CHECK-NEXT:    v_mov_b32_e32 v5, 0
; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 0
; CHECK-NEXT:    v_mov_b32_e32 v3, 0
; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT:  ; %bb.1: ; %inc
; CHECK-NEXT:    v_not_b32_e32 v2, v4
; CHECK-NEXT:    v_not_b32_e32 v3, 0
; CHECK-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
; CHECK-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT:    v_mov_b32_e32 v0, v2
; CHECK-NEXT:    v_mov_b32_e32 v1, v3
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  %ctlz = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 %x, i1 true)
  %cmp.entry.not = icmp eq i64 %ctlz, %x
  br i1 %cmp.entry.not, label %inc, label %ret

inc:                                              ; preds = %entry
  %i1 = xor i64 %ctlz, -1
  %i2 = add i64 %x, %i1
  ret i64 %i2

ret:                                              ; preds = %loop, %entry
  ret i64 0
}

define i64 @s_xor_i64_known_i32_from_range_use_out_of_block(i64 inreg %x) {
; CHECK-LABEL: s_xor_i64_known_i32_from_range_use_out_of_block:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_flbit_i32_b64 s4, s[16:17]
; CHECK-NEXT:    s_mov_b32 s5, 0
; CHECK-NEXT:    s_cmp_lg_u64 s[4:5], s[16:17]
; CHECK-NEXT:    s_cbranch_scc1 .LBB7_2
; CHECK-NEXT:  ; %bb.1: ; %inc
; CHECK-NEXT:    s_not_b64 s[4:5], s[4:5]
; CHECK-NEXT:    s_add_u32 s4, s16, s4
; CHECK-NEXT:    s_addc_u32 s5, s17, s5
; CHECK-NEXT:    v_mov_b32_e32 v0, s4
; CHECK-NEXT:    v_mov_b32_e32 v1, s5
; CHECK-NEXT:    s_setpc_b64 s[30:31]
; CHECK-NEXT:  .LBB7_2: ; %ret
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 0
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  %ctlz = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 %x, i1 true)
  %cmp.entry.not = icmp eq i64 %ctlz, %x
  br i1 %cmp.entry.not, label %inc, label %ret

inc:                                              ; preds = %entry
  %i1 = xor i64 %ctlz, -1
  %i2 = add i64 %x, %i1
  ret i64 %i2

ret:                                              ; preds = %loop, %entry
  ret i64 0
}
