AMDGPU: Don't avoid clamp of bit shift in BFE pattern #115372
Conversation
Enable pattern matching from "x << (32 - y) >> (32 - y)" to "bfe x, 0, y" when we know y is in [0, 31]. This is a follow-up to PR llvm#114279 to fix issue llvm#114282.
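To make the idiom concrete, here is a minimal scalar model in C++ (mine, not part of the patch) of the pattern being matched; the clamp y &= 31 mirrors the "and i32 %width, 31" in the updated tests:

#include <cassert>
#include <cstdint>

// Shifting x left and then right by (32 - y) clears the high (32 - y)
// bits, i.e. extracts the low y bits -- the same thing v_bfe_u32 does
// with offset 0 and width y.
uint32_t extract_low_bits(uint32_t x, uint32_t y) {
  y &= 31; // clamp the width to [0, 31], as the tests do
  if (y == 0)
    return 0; // a width of 0 extracts nothing; the guard also avoids the
              // shift-by-32 case, which is undefined behavior in C++
  return (x << (32 - y)) >> (32 - y);
}

int main() {
  assert(extract_low_bits(0xDEADBEEFu, 8) == 0xEFu);
  assert(extract_low_bits(0xDEADBEEFu, 16) == 0xBEEFu);
}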
@llvm/pr-subscribers-backend-amdgpu

Author: Changpeng Fang (changpeng)

Changes: Enable pattern matching from "x << (32 - y) >> (32 - y)" to "bfe x, 0, y" when we know y is in [0, 31].

Full diff: https://github.com/llvm/llvm-project/pull/115372.diff

5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 21fffba14287ef..e3a330d45aaa57 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -22,7 +22,6 @@
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 11c4cdd560c2f3..5ae0b179d7d0e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -17,6 +17,7 @@
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIModeRegisterDefaults.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 52df38c352cf53..6969e687a9734b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3553,6 +3553,23 @@ def : AMDGPUPat <
(V_BFE_U32_e64 $src, (i32 0), $width)
>;
+def uint5Bits : PatLeaf<(i32 VGPR_32:$width), [{
+ return CurDAG->computeKnownBits(SDValue(N, 0)).countMinLeadingZeros() >= 27;
+}]>;
+
+// x << (bitwidth - y) >> (bitwidth - y)
+def : AMDGPUPat <
+ (DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, uint5Bits:$width)),
+ (sub 32, uint5Bits:$width)),
+ (V_BFE_U32_e64 $src, (i32 0), $width)
+>;
+
+def : AMDGPUPat <
+ (DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, uint5Bits:$width)),
+ (sub 32, uint5Bits:$width)),
+ (V_BFE_I32_e64 $src, (i32 0), $width)
+>;
+
// SHA-256 Ma patterns
// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
index c57a35aa1880db..bdba8c57dc745d 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -17,9 +17,8 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
-; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
-; SI-NEXT: v_lshrrev_b32_e32 v2, v3, v2
+; SI-NEXT: v_and_b32_e32 v3, 31, v3
+; SI-NEXT: v_bfe_u32 v2, v2, 0, v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -38,9 +37,8 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
-; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
-; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
+; VI-NEXT: v_and_b32_e32 v2, 31, v4
+; VI-NEXT: v_bfe_u32 v2, v3, 0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -49,7 +47,8 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
%src = load volatile i32, ptr addrspace(1) %in0.gep
%width = load volatile i32, ptr addrspace(1) %in0.gep
- %sub = sub i32 32, %width
+ %width5 = and i32 %width, 31
+ %sub = sub i32 32, %width5
%shl = shl i32 %src, %sub
%bfe = lshr i32 %shl, %sub
store i32 %bfe, ptr addrspace(1) %out.gep
@@ -72,6 +71,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_and_b32_e32 v3, 31, v3
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, v3, v2
@@ -95,7 +95,8 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
+; VI-NEXT: v_and_b32_e32 v2, 31, v4
+; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v2
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -108,7 +109,8 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
%src = load volatile i32, ptr addrspace(1) %in0.gep
%width = load volatile i32, ptr addrspace(1) %in0.gep
- %sub = sub i32 32, %width
+ %width5 = and i32 %width, 31
+ %sub = sub i32 32, %width5
%shl = shl i32 %src, %sub
%bfe = lshr i32 %shl, %sub
store i32 %bfe, ptr addrspace(1) %out.gep
@@ -219,9 +221,8 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
-; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
-; SI-NEXT: v_ashrrev_i32_e32 v2, v3, v2
+; SI-NEXT: v_and_b32_e32 v3, 31, v3
+; SI-NEXT: v_bfe_i32 v2, v2, 0, v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
@@ -240,9 +241,8 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
-; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
-; VI-NEXT: v_ashrrev_i32_e32 v2, v2, v3
+; VI-NEXT: v_and_b32_e32 v2, 31, v4
+; VI-NEXT: v_bfe_i32 v2, v3, 0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -251,7 +251,8 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
%src = load volatile i32, ptr addrspace(1) %in0.gep
%width = load volatile i32, ptr addrspace(1) %in0.gep
- %sub = sub i32 32, %width
+ %width5 = and i32 %width, 31
+ %sub = sub i32 32, %width5
%shl = shl i32 %src, %sub
%bfe = ashr i32 %shl, %sub
store i32 %bfe, ptr addrspace(1) %out.gep
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index 3d9616f02d52d1..3de8db2c6a448e 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -150,22 +150,14 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
; ---------------------------------------------------------------------------- ;
define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
-; SI-LABEL: bzhi32_d0:
-; SI: ; %bb.0:
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
-; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
-; SI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: bzhi32_d0:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
-; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
-; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
-; VI-NEXT: s_setpc_b64 s[30:31]
- %numhighbits = sub i32 32, %numlowbits
+; GCN-LABEL: bzhi32_d0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 31, v1
+; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %numlow5bits = and i32 %numlowbits, 31
+ %numhighbits = sub i32 32, %numlow5bits
%highbitscleared = shl i32 %val, %numhighbits
%masked = lshr i32 %highbitscleared, %numhighbits
ret i32 %masked
LGTM with a nit: use "countMaxTrailingOnes() <= 5" instead of "countMinLeadingZeros() >= 27" in the uint5Bits PatLeaf definition.
; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%numhighbits = sub i32 32, %numlowbits
; GCN-LABEL: bzhi32_d0:
This isn't canonical IR, and we shouldn't be burning compile time optimizing non-canonical IR. Can you add a copy of this test with the instcombine output, and revert this change if you get the same output?
Instcombine does not change the test source, and I think extracting the low bits is frequently used in programs to restrict the range of a value.
Instcombine does change this:
define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) #0 {
%numlow5bits = and i32 %numlowbits, 31
%numhighbits = sub nuw nsw i32 32, %numlow5bits
%1 = lshr i32 -1, %numhighbits
%masked = and i32 %1, %val
ret i32 %masked
}
Codegenning this does produce BFE already (except we fail to optimize out the clamp, which is a separate issue).
@@ -3553,6 +3553,23 @@ def : AMDGPUPat <
  (V_BFE_U32_e64 $src, (i32 0), $width)
>;

def uint5Bits : PatLeaf<(i32 VGPR_32:$width), [{
  return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxTrailingOnes() <= 5;
No, this is completely wrong! E.g. if bit 0 is known to be 0, then countMaxTrailingOnes() will return 0 and this test will pass.
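For concreteness, a standalone sketch (mine, not from the review) using llvm::KnownBits that demonstrates the failure mode described above:

#include "llvm/Support/KnownBits.h"
#include <cassert>

using namespace llvm;

int main() {
  // A 32-bit value whose bit 0 is known to be zero; all other bits unknown.
  KnownBits Known(32);
  Known.Zero.setBit(0);

  // The first known-zero bit caps any possible run of trailing ones, so
  // countMaxTrailingOnes() returns 0 and the "<= 5" test accepts this value...
  assert(Known.countMaxTrailingOnes() == 0);

  // ...even though the value itself can be as large as 0xFFFFFFFE,
  // far outside [0, 31].
  assert(Known.getMaxValue().getZExtValue() == 0xFFFFFFFEu);
  return 0;
}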
Ugh, my bad, sorry.
Thanks so much. Also, countMaxActiveBits() is closer to what we are trying to express here.
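For reference, a small sketch (again mine, not from the patch) showing that on a 32-bit value "countMaxActiveBits() <= 5" is equivalent to the original "countMinLeadingZeros() >= 27" check, since countMaxActiveBits() is the bit width minus countMinLeadingZeros():

#include "llvm/Support/KnownBits.h"
#include <cassert>

using namespace llvm;

int main() {
  // Model the clamped width: "and i32 %width, 31" makes bits 5..31
  // known zero.
  KnownBits Clamped(32);
  Clamped.Zero.setHighBits(27);

  // Both forms of the check hold, and both prove the value is <= 31.
  assert(Clamped.countMaxActiveBits() <= 5);
  assert(Clamped.countMinLeadingZeros() >= 27);
  assert(Clamped.getMaxValue().getZExtValue() <= 31);
  return 0;
}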
countMaxTrailingOnes() is not correct. This patch follows the suggestion from llvm#115372.