Skip to content

Commit 9778fc7

Browse files
authored
Revert "AMDGPU: Don't avoid clamp of bit shift in BFE pattern (#115372)" (#116091)
Based on the suggestion from #115543, we should not do the pattern matching from x << (32-y) >> (32-y) to "bfe x, 0, y" at all. This reverts commits a2bacf8 and bdf8e30.
1 parent 569c36e commit 9778fc7

File tree

5 files changed

+40
-50
lines changed

5 files changed

+40
-50
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "SIISelLowering.h"
2323
#include "SIMachineFunctionInfo.h"
2424
#include "llvm/Analysis/UniformityAnalysis.h"
25+
#include "llvm/Analysis/ValueTracking.h"
2526
#include "llvm/CodeGen/FunctionLoweringInfo.h"
2627
#include "llvm/CodeGen/SelectionDAG.h"
2728
#include "llvm/CodeGen/SelectionDAGISel.h"

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
#include "GCNSubtarget.h"
1818
#include "SIMachineFunctionInfo.h"
1919
#include "SIModeRegisterDefaults.h"
20-
#include "llvm/Analysis/ValueTracking.h"
2120
#include "llvm/CodeGen/SelectionDAGISel.h"
2221
#include "llvm/Target/TargetMachine.h"
2322

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3556,23 +3556,6 @@ def : AMDGPUPat <
35563556
(V_BFE_U32_e64 $src, (i32 0), $width)
35573557
>;
35583558

3559-
def uint5Bits : PatLeaf<(i32 VGPR_32:$width), [{
3560-
return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxActiveBits() <= 5;
3561-
}]>;
3562-
3563-
// x << (bitwidth - y) >> (bitwidth - y)
3564-
def : AMDGPUPat <
3565-
(DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, uint5Bits:$width)),
3566-
(sub 32, uint5Bits:$width)),
3567-
(V_BFE_U32_e64 $src, (i32 0), $width)
3568-
>;
3569-
3570-
def : AMDGPUPat <
3571-
(DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, uint5Bits:$width)),
3572-
(sub 32, uint5Bits:$width)),
3573-
(V_BFE_I32_e64 $src, (i32 0), $width)
3574-
>;
3575-
35763559
// SHA-256 Ma patterns
35773560

35783561
// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y

llvm/test/CodeGen/AMDGPU/bfe-patterns.ll

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
1717
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
1818
; SI-NEXT: s_waitcnt vmcnt(0)
1919
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
20-
; SI-NEXT: v_and_b32_e32 v3, 31, v3
21-
; SI-NEXT: v_bfe_u32 v2, v2, 0, v3
20+
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
21+
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
22+
; SI-NEXT: v_lshrrev_b32_e32 v2, v3, v2
2223
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2324
; SI-NEXT: s_endpgm
2425
;
@@ -37,8 +38,9 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
3738
; VI-NEXT: v_mov_b32_e32 v1, s1
3839
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
3940
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
40-
; VI-NEXT: v_and_b32_e32 v2, 31, v4
41-
; VI-NEXT: v_bfe_u32 v2, v3, 0, v2
41+
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
42+
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
43+
; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
4244
; VI-NEXT: flat_store_dword v[0:1], v2
4345
; VI-NEXT: s_endpgm
4446
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -47,8 +49,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
4749
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
4850
%src = load volatile i32, ptr addrspace(1) %in0.gep
4951
%width = load volatile i32, ptr addrspace(1) %in0.gep
50-
%width5 = and i32 %width, 31
51-
%sub = sub i32 32, %width5
52+
%sub = sub i32 32, %width
5253
%shl = shl i32 %src, %sub
5354
%bfe = lshr i32 %shl, %sub
5455
store i32 %bfe, ptr addrspace(1) %out.gep
@@ -71,7 +72,6 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
7172
; SI-NEXT: s_waitcnt vmcnt(0)
7273
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
7374
; SI-NEXT: s_mov_b32 s6, -1
74-
; SI-NEXT: v_and_b32_e32 v3, 31, v3
7575
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
7676
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
7777
; SI-NEXT: v_lshrrev_b32_e32 v3, v3, v2
@@ -95,8 +95,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
9595
; VI-NEXT: v_mov_b32_e32 v1, s1
9696
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
9797
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
98-
; VI-NEXT: v_and_b32_e32 v2, 31, v4
99-
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v2
98+
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
10099
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
101100
; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
102101
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -109,8 +108,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
109108
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
110109
%src = load volatile i32, ptr addrspace(1) %in0.gep
111110
%width = load volatile i32, ptr addrspace(1) %in0.gep
112-
%width5 = and i32 %width, 31
113-
%sub = sub i32 32, %width5
111+
%sub = sub i32 32, %width
114112
%shl = shl i32 %src, %sub
115113
%bfe = lshr i32 %shl, %sub
116114
store i32 %bfe, ptr addrspace(1) %out.gep
@@ -221,8 +219,9 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
221219
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
222220
; SI-NEXT: s_waitcnt vmcnt(0)
223221
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
224-
; SI-NEXT: v_and_b32_e32 v3, 31, v3
225-
; SI-NEXT: v_bfe_i32 v2, v2, 0, v3
222+
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
223+
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
224+
; SI-NEXT: v_ashrrev_i32_e32 v2, v3, v2
226225
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
227226
; SI-NEXT: s_endpgm
228227
;
@@ -241,8 +240,9 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
241240
; VI-NEXT: v_mov_b32_e32 v1, s1
242241
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
243242
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
244-
; VI-NEXT: v_and_b32_e32 v2, 31, v4
245-
; VI-NEXT: v_bfe_i32 v2, v3, 0, v2
243+
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
244+
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
245+
; VI-NEXT: v_ashrrev_i32_e32 v2, v2, v3
246246
; VI-NEXT: flat_store_dword v[0:1], v2
247247
; VI-NEXT: s_endpgm
248248
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -251,8 +251,7 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
251251
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
252252
%src = load volatile i32, ptr addrspace(1) %in0.gep
253253
%width = load volatile i32, ptr addrspace(1) %in0.gep
254-
%width5 = and i32 %width, 31
255-
%sub = sub i32 32, %width5
254+
%sub = sub i32 32, %width
256255
%shl = shl i32 %src, %sub
257256
%bfe = ashr i32 %shl, %sub
258257
store i32 %bfe, ptr addrspace(1) %out.gep

llvm/test/CodeGen/AMDGPU/extract-lowbits.ll

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -150,39 +150,47 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
150150
; ---------------------------------------------------------------------------- ;
151151

152152
define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
153-
; GCN-LABEL: bzhi32_d0:
154-
; GCN: ; %bb.0:
155-
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
156-
; GCN-NEXT: v_and_b32_e32 v1, 31, v1
157-
; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
158-
; GCN-NEXT: s_setpc_b64 s[30:31]
159-
%numlow5bits = and i32 %numlowbits, 31
160-
%numhighbits = sub i32 32, %numlow5bits
153+
; SI-LABEL: bzhi32_d0:
154+
; SI: ; %bb.0:
155+
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
156+
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
157+
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
158+
; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
159+
; SI-NEXT: s_setpc_b64 s[30:31]
160+
;
161+
; VI-LABEL: bzhi32_d0:
162+
; VI: ; %bb.0:
163+
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164+
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
165+
; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
166+
; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
167+
; VI-NEXT: s_setpc_b64 s[30:31]
168+
%numhighbits = sub i32 32, %numlowbits
161169
%highbitscleared = shl i32 %val, %numhighbits
162170
%masked = lshr i32 %highbitscleared, %numhighbits
163171
ret i32 %masked
164172
}
165173

166-
define i32 @bzhi32_d0_even(i32 %val, i32 %numlowbits) nounwind {
167-
; SI-LABEL: bzhi32_d0_even:
174+
define i32 @bzhi32_d0_5bits(i32 %val, i32 %numlowbits) nounwind {
175+
; SI-LABEL: bzhi32_d0_5bits:
168176
; SI: ; %bb.0:
169177
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
170-
; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v1
178+
; SI-NEXT: v_and_b32_e32 v1, 31, v1
171179
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
172180
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
173181
; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
174182
; SI-NEXT: s_setpc_b64 s[30:31]
175183
;
176-
; VI-LABEL: bzhi32_d0_even:
184+
; VI-LABEL: bzhi32_d0_5bits:
177185
; VI: ; %bb.0:
178186
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
179-
; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v1
187+
; VI-NEXT: v_and_b32_e32 v1, 31, v1
180188
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
181189
; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
182190
; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
183191
; VI-NEXT: s_setpc_b64 s[30:31]
184-
%times2 = shl i32 %numlowbits, 1
185-
%numhighbits = sub i32 32, %times2
192+
%numlow5bits = and i32 %numlowbits, 31
193+
%numhighbits = sub i32 32, %numlow5bits
186194
%highbitscleared = shl i32 %val, %numhighbits
187195
%masked = lshr i32 %highbitscleared, %numhighbits
188196
ret i32 %masked

0 commit comments

Comments
 (0)