AMDGPU: Don't avoid clamp of bit shift in BFE pattern #115372

Merged (2 commits) on Nov 7, 2024

Conversation

changpeng (Contributor)

Enable pattern matching from "x<<32-y>>32-y" to "bfe x, 0, y" when we know y is in [0,31].
This is the follow-up for the PR: #114279 to fix the issue: #114282

@llvmbot (Member) commented Nov 7, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Changpeng Fang (changpeng)


Full diff: https://github.com/llvm/llvm-project/pull/115372.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h (+1)
  • (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+17)
  • (modified) llvm/test/CodeGen/AMDGPU/bfe-patterns.ll (+17-16)
  • (modified) llvm/test/CodeGen/AMDGPU/extract-lowbits.ll (+8-16)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 21fffba14287ef..e3a330d45aaa57 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -22,7 +22,6 @@
 #include "SIISelLowering.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
-#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 11c4cdd560c2f3..5ae0b179d7d0e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -17,6 +17,7 @@
 #include "GCNSubtarget.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIModeRegisterDefaults.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/Target/TargetMachine.h"
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 52df38c352cf53..6969e687a9734b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3553,6 +3553,23 @@ def : AMDGPUPat <
   (V_BFE_U32_e64 $src, (i32 0), $width)
 >;
 
+def uint5Bits : PatLeaf<(i32 VGPR_32:$width), [{
+  return CurDAG->computeKnownBits(SDValue(N, 0)).countMinLeadingZeros() >= 27;
+}]>;
+
+// x << (bitwidth - y) >> (bitwidth - y)
+def : AMDGPUPat <
+  (DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, uint5Bits:$width)),
+                         (sub 32, uint5Bits:$width)),
+   (V_BFE_U32_e64 $src, (i32 0), $width)
+>;
+
+def : AMDGPUPat <
+  (DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, uint5Bits:$width)),
+                         (sub 32, uint5Bits:$width)),
+   (V_BFE_I32_e64 $src, (i32 0), $width)
+>;
+
 // SHA-256 Ma patterns
 
 // ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
index c57a35aa1880db..bdba8c57dc745d 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -17,9 +17,8 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
 ; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    v_sub_i32_e32 v3, vcc, 32, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
+; SI-NEXT:    v_and_b32_e32 v3, 31, v3
+; SI-NEXT:    v_bfe_u32 v2, v2, 0, v3
 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-NEXT:    s_endpgm
 ;
@@ -38,9 +37,8 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
-; VI-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
+; VI-NEXT:    v_and_b32_e32 v2, 31, v4
+; VI-NEXT:    v_bfe_u32 v2, v3, 0, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -49,7 +47,8 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
   %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
   %src = load volatile i32, ptr addrspace(1) %in0.gep
   %width = load volatile i32, ptr addrspace(1) %in0.gep
-  %sub = sub i32 32, %width
+  %width5 = and i32 %width, 31
+  %sub = sub i32 32, %width5
   %shl = shl i32 %src, %sub
   %bfe = lshr i32 %shl, %sub
   store i32 %bfe, ptr addrspace(1) %out.gep
@@ -72,6 +71,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    v_and_b32_e32 v3, 31, v3
 ; SI-NEXT:    v_sub_i32_e32 v3, vcc, 32, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, v3, v2
@@ -95,7 +95,8 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
+; VI-NEXT:    v_and_b32_e32 v2, 31, v4
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, 32, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -108,7 +109,8 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
   %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
   %src = load volatile i32, ptr addrspace(1) %in0.gep
   %width = load volatile i32, ptr addrspace(1) %in0.gep
-  %sub = sub i32 32, %width
+  %width5 = and i32 %width, 31
+  %sub = sub i32 32, %width5
   %shl = shl i32 %src, %sub
   %bfe = lshr i32 %shl, %sub
   store i32 %bfe, ptr addrspace(1) %out.gep
@@ -219,9 +221,8 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
 ; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    v_sub_i32_e32 v3, vcc, 32, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
-; SI-NEXT:    v_ashrrev_i32_e32 v2, v3, v2
+; SI-NEXT:    v_and_b32_e32 v3, 31, v3
+; SI-NEXT:    v_bfe_i32 v2, v2, 0, v3
 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-NEXT:    s_endpgm
 ;
@@ -240,9 +241,8 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
-; VI-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
-; VI-NEXT:    v_ashrrev_i32_e32 v2, v2, v3
+; VI-NEXT:    v_and_b32_e32 v2, 31, v4
+; VI-NEXT:    v_bfe_i32 v2, v3, 0, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -251,7 +251,8 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
   %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
   %src = load volatile i32, ptr addrspace(1) %in0.gep
   %width = load volatile i32, ptr addrspace(1) %in0.gep
-  %sub = sub i32 32, %width
+  %width5 = and i32 %width, 31
+  %sub = sub i32 32, %width5
   %shl = shl i32 %src, %sub
   %bfe = ashr i32 %shl, %sub
   store i32 %bfe, ptr addrspace(1) %out.gep
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index 3d9616f02d52d1..3de8db2c6a448e 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -150,22 +150,14 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; ---------------------------------------------------------------------------- ;
 
 define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
-; SI-LABEL: bzhi32_d0:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
-; SI-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: bzhi32_d0:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
-; VI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
-; VI-NEXT:    s_setpc_b64 s[30:31]
-  %numhighbits = sub i32 32, %numlowbits
+; GCN-LABEL: bzhi32_d0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 31, v1
+; GCN-NEXT:    v_bfe_u32 v0, v0, 0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %numlow5bits = and i32 %numlowbits, 31
+  %numhighbits = sub i32 32, %numlow5bits
   %highbitscleared = shl i32 %val, %numhighbits
   %masked = lshr i32 %highbitscleared, %numhighbits
   ret i32 %masked

@rampitec (Collaborator) left a comment:

LGTM with a nit.

  Use "countMaxTrailingOnes() <= 5" instead of "countMinLeadingZeros() >= 27"
in uint5Bits PatLeaf definition.
@changpeng changpeng merged commit bdf8e30 into llvm:main Nov 7, 2024
5 of 7 checks passed
(Inline comment on the bzhi32_d0 test in llvm/test/CodeGen/AMDGPU/extract-lowbits.ll)
Contributor:

This isn't canonical IR. We shouldn't be burning compile time optimizing non-canonical IR. Can you add a copy of this test with the instcombine output, and revert this if you get the same output?

Contributor Author:

Instcombine does not change the test source, and I think extracting the low bits is frequently used in programs to restrict the range of a value.

Contributor:

Instcombine does change this:

define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) #0 {
  %numlow5bits = and i32 %numlowbits, 31
  %numhighbits = sub nuw nsw i32 32, %numlow5bits
  %1 = lshr i32 -1, %numhighbits
  %masked = and i32 %1, %val
  ret i32 %masked
}

Codegenning this does produce BFE already (except we fail to optimize out the clamp, which is a separate issue)

@@ -3553,6 +3553,23 @@ def : AMDGPUPat <
(V_BFE_U32_e64 $src, (i32 0), $width)
>;

def uint5Bits : PatLeaf<(i32 VGPR_32:$width), [{
return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxTrailingOnes() <= 5;
Contributor:

No, this is completely wrong!!! E.g. if bit 0 is known to be 0 then countMaxTrailingOnes will return 0 and this test will pass.

Collaborator:

Ugh, my bad, sorry.

Contributor Author:

Thanks so much. Also, countMaxActiveBits is closer to what we want to define.

@changpeng changpeng deleted the bfe branch November 8, 2024 18:00
changpeng added a commit to changpeng/llvm-project that referenced this pull request Nov 8, 2024
  countMaxTrailingOnes() is not correct. This patch follows the suggestion
from llvm#115372.
changpeng added a commit that referenced this pull request Nov 8, 2024
countMaxTrailingOnes() is not correct. This patch follows the suggestion
from #115372.
changpeng added a commit to changpeng/llvm-project that referenced this pull request Nov 13, 2024
changpeng added a commit that referenced this pull request Nov 13, 2024
…" (#116091)

Based on the suggestion from
#115543, we should not do the
pattern matching from x << (32-y) >> (32-y) to "bfe x, 0, y" at all.
This reverts commits a2bacf8 and
bdf8e30.
Groverkss pushed a commit to iree-org/llvm-project that referenced this pull request Nov 15, 2024
Enable pattern matching from "x<<32-y>>32-y" to "bfe x, 0, y" when we
know y is in [0,31].
This is the follow-up for the PR:
llvm#114279 to fix the issue:
llvm#114282
Groverkss pushed a commit to iree-org/llvm-project that referenced this pull request Nov 15, 2024

countMaxTrailingOnes() is not correct. This patch follows the suggestion
from llvm#115372.