Skip to content

Commit 8f8e8db

Browse files
changpeng authored and Groverkss committed
AMDGPU: Don't avoid clamp of bit shift in BFE pattern (llvm#115372)
Enable pattern matching from "x << (32 - y) >> (32 - y)" to "bfe x, 0, y" when we know y is in [0, 31]. This is a follow-up to PR llvm#114279 and fixes issue llvm#114282.
1 parent 1e98405 commit 8f8e8db

File tree

5 files changed

+43
-33
lines changed

5 files changed

+43
-33
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
#include "SIISelLowering.h"
2323
#include "SIMachineFunctionInfo.h"
2424
#include "llvm/Analysis/UniformityAnalysis.h"
25-
#include "llvm/Analysis/ValueTracking.h"
2625
#include "llvm/CodeGen/FunctionLoweringInfo.h"
2726
#include "llvm/CodeGen/SelectionDAG.h"
2827
#include "llvm/CodeGen/SelectionDAGISel.h"

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "GCNSubtarget.h"
1818
#include "SIMachineFunctionInfo.h"
1919
#include "SIModeRegisterDefaults.h"
20+
#include "llvm/Analysis/ValueTracking.h"
2021
#include "llvm/CodeGen/SelectionDAGISel.h"
2122
#include "llvm/Target/TargetMachine.h"
2223

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3553,6 +3553,23 @@ def : AMDGPUPat <
35533553
(V_BFE_U32_e64 $src, (i32 0), $width)
35543554
>;
35553555

3556+
// Matches an i32 VGPR width operand whose known bits prove it fits in 5 bits,
// i.e. the value is in [0, 31].
def uint5Bits : PatLeaf<(i32 VGPR_32:$width), [{
3557+
// Bound the number of active (significant) bits; counting trailing ones would
// accept values with high bits set, such as 0x100.
return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxActiveBits() <= 5;
3558+
}]>;
3559+
3560+
// x << (bitwidth - y) >> (bitwidth - y), using a divergent logical right
// shift (srl) over a single-use shl, where both shift amounts are
// (sub 32, y) and y is matched by uint5Bits.
// Selected as V_BFE_U32_e64 on x with offset 0 and width y.
3561+
def : AMDGPUPat <
3562+
(DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, uint5Bits:$width)),
3563+
(sub 32, uint5Bits:$width)),
3564+
(V_BFE_U32_e64 $src, (i32 0), $width)
3565+
>;
3566+
3567+
// x << (32 - y) >> (32 - y), using a divergent arithmetic right shift (sra)
// over a single-use shl, where y is matched by uint5Bits.
// Selected as V_BFE_I32_e64 on x with offset 0 and width y.
def : AMDGPUPat <
3568+
(DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, uint5Bits:$width)),
3569+
(sub 32, uint5Bits:$width)),
3570+
(V_BFE_I32_e64 $src, (i32 0), $width)
3571+
>;
3572+
35563573
// SHA-256 Ma patterns
35573574

35583575
// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y

llvm/test/CodeGen/AMDGPU/bfe-patterns.ll

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,8 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
1717
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
1818
; SI-NEXT: s_waitcnt vmcnt(0)
1919
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
20-
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
21-
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
22-
; SI-NEXT: v_lshrrev_b32_e32 v2, v3, v2
20+
; SI-NEXT: v_and_b32_e32 v3, 31, v3
21+
; SI-NEXT: v_bfe_u32 v2, v2, 0, v3
2322
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2423
; SI-NEXT: s_endpgm
2524
;
@@ -38,9 +37,8 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
3837
; VI-NEXT: v_mov_b32_e32 v1, s1
3938
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
4039
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
41-
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
42-
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
43-
; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
40+
; VI-NEXT: v_and_b32_e32 v2, 31, v4
41+
; VI-NEXT: v_bfe_u32 v2, v3, 0, v2
4442
; VI-NEXT: flat_store_dword v[0:1], v2
4543
; VI-NEXT: s_endpgm
4644
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -49,7 +47,8 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
4947
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
5048
%src = load volatile i32, ptr addrspace(1) %in0.gep
5149
%width = load volatile i32, ptr addrspace(1) %in0.gep
52-
%sub = sub i32 32, %width
50+
%width5 = and i32 %width, 31
51+
%sub = sub i32 32, %width5
5352
%shl = shl i32 %src, %sub
5453
%bfe = lshr i32 %shl, %sub
5554
store i32 %bfe, ptr addrspace(1) %out.gep
@@ -72,6 +71,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
7271
; SI-NEXT: s_waitcnt vmcnt(0)
7372
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
7473
; SI-NEXT: s_mov_b32 s6, -1
74+
; SI-NEXT: v_and_b32_e32 v3, 31, v3
7575
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
7676
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
7777
; SI-NEXT: v_lshrrev_b32_e32 v3, v3, v2
@@ -95,7 +95,8 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
9595
; VI-NEXT: v_mov_b32_e32 v1, s1
9696
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
9797
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
98-
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
98+
; VI-NEXT: v_and_b32_e32 v2, 31, v4
99+
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v2
99100
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
100101
; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
101102
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -108,7 +109,8 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
108109
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
109110
%src = load volatile i32, ptr addrspace(1) %in0.gep
110111
%width = load volatile i32, ptr addrspace(1) %in0.gep
111-
%sub = sub i32 32, %width
112+
%width5 = and i32 %width, 31
113+
%sub = sub i32 32, %width5
112114
%shl = shl i32 %src, %sub
113115
%bfe = lshr i32 %shl, %sub
114116
store i32 %bfe, ptr addrspace(1) %out.gep
@@ -219,9 +221,8 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
219221
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
220222
; SI-NEXT: s_waitcnt vmcnt(0)
221223
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
222-
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
223-
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
224-
; SI-NEXT: v_ashrrev_i32_e32 v2, v3, v2
224+
; SI-NEXT: v_and_b32_e32 v3, 31, v3
225+
; SI-NEXT: v_bfe_i32 v2, v2, 0, v3
225226
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
226227
; SI-NEXT: s_endpgm
227228
;
@@ -240,9 +241,8 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
240241
; VI-NEXT: v_mov_b32_e32 v1, s1
241242
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
242243
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
243-
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
244-
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
245-
; VI-NEXT: v_ashrrev_i32_e32 v2, v2, v3
244+
; VI-NEXT: v_and_b32_e32 v2, 31, v4
245+
; VI-NEXT: v_bfe_i32 v2, v3, 0, v2
246246
; VI-NEXT: flat_store_dword v[0:1], v2
247247
; VI-NEXT: s_endpgm
248248
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -251,7 +251,8 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
251251
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
252252
%src = load volatile i32, ptr addrspace(1) %in0.gep
253253
%width = load volatile i32, ptr addrspace(1) %in0.gep
254-
%sub = sub i32 32, %width
254+
%width5 = and i32 %width, 31
255+
%sub = sub i32 32, %width5
255256
%shl = shl i32 %src, %sub
256257
%bfe = ashr i32 %shl, %sub
257258
store i32 %bfe, ptr addrspace(1) %out.gep

llvm/test/CodeGen/AMDGPU/extract-lowbits.ll

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -150,22 +150,14 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
150150
; ---------------------------------------------------------------------------- ;
151151

152152
define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
153-
; SI-LABEL: bzhi32_d0:
154-
; SI: ; %bb.0:
155-
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
156-
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
157-
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
158-
; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
159-
; SI-NEXT: s_setpc_b64 s[30:31]
160-
;
161-
; VI-LABEL: bzhi32_d0:
162-
; VI: ; %bb.0:
163-
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164-
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
165-
; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
166-
; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
167-
; VI-NEXT: s_setpc_b64 s[30:31]
168-
%numhighbits = sub i32 32, %numlowbits
153+
; GCN-LABEL: bzhi32_d0:
154+
; GCN: ; %bb.0:
155+
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
156+
; GCN-NEXT: v_and_b32_e32 v1, 31, v1
157+
; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
158+
; GCN-NEXT: s_setpc_b64 s[30:31]
159+
%numlow5bits = and i32 %numlowbits, 31
160+
%numhighbits = sub i32 32, %numlow5bits
169161
%highbitscleared = shl i32 %val, %numhighbits
170162
%masked = lshr i32 %highbitscleared, %numhighbits
171163
ret i32 %masked

0 commit comments

Comments
 (0)