[AMDGPU][True16][CodeGen] true16 codegen for bswap #122849

broxigarchen · 2025-01-14T02:40:36Z

true16 codegen pattern for bswap

llvmbot · 2025-01-14T02:41:27Z

@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

Changes

true16 codegen pattern for bswap

Full diff: https://github.com/llvm/llvm-project/pull/122849.diff

2 Files Affected:

(modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+15)
(modified) llvm/test/CodeGen/AMDGPU/bswap.ll (+17-8)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index cdc1132579d8d8..3b924d869a184c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3030,6 +3030,8 @@ def : GCNPat <
 
 // Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24)
 // The 12s emit 0s.
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
 def : GCNPat <
   (i16 (bswap i16:$a)),
   (V_PERM_B32_e64  (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
@@ -3039,6 +3041,19 @@ def : GCNPat <
   (i32 (zext (bswap i16:$a))),
   (V_PERM_B32_e64  (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
 >;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+  (i16 (bswap i16:$a)),
+  (EXTRACT_SUBREG (V_PERM_B32_e64  (i32 0), (COPY VGPR_16:$a), (S_MOV_B32 (i32 0x0c0c0001))), lo16)
+>;
+
+def : GCNPat <
+  (i32 (zext (bswap i16:$a))),
+  (V_PERM_B32_e64  (i32 0), (COPY VGPR_16:$a), (S_MOV_B32 (i32 0x0c0c0001)))
+>;
+}
 
 // Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24)
 def : GCNPat <
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index 30c8e94c9a27f5..a95a1aba0c9141 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=amdgcn-- -verify-machineinstrs | FileCheck %s --check-prefix=SI
 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-REAL16
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16
 
 declare i16 @llvm.bswap.i16(i16) nounwind readnone
 declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) nounwind readnone
@@ -490,13 +491,21 @@ define float @missing_truncate_promote_bswap(i32 %arg) {
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: missing_truncate_promote_bswap:
-; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-REAL16-LABEL: missing_truncate_promote_bswap:
+; GFX11-REAL16:       ; %bb.0: ; %bb
+; GFX11-REAL16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-REAL16-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
+; GFX11-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-REAL16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: missing_truncate_promote_bswap:
+; GFX11-FAKE16:       ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = trunc i32 %arg to i16
   %tmp1 = call i16 @llvm.bswap.i16(i16 %tmp)

kosarev

LGTM.

arsenm · 2025-01-14T12:50:46Z

llvm/lib/Target/AMDGPU/SIInstructions.td

+>;
+
+def : GCNPat <
+  (i32 (zext (bswap i16:$a))),


should this handle zext or any ext

Hi Matt. From the pattern this is zero filling so it should be zext.

This is following the existing pattern here https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AMDGPU/SIInstructions.td#L3039

I don't know if we need to handle sext. It seems rare that if we need to handle a byte swap and at the mean time do a sext

I don't mean sext, I mean anyext

I see. I think we can have both zext and anyext here like https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AMDGPU/SIInstructions.td#L2355-L2356

But if there is only one then it should be using zext. The reason would be
zext -> anyext cost nothing, but anyext -> zext will add an addtional and operation to zero out top bits.

i.e. for example

llvm-project/llvm/test/CodeGen/AMDGPU/bswap.ll

Lines 567 to 568 in 98da375

%bswap = call i16 @llvm.bswap.i16(i16 %src)

%zext = zext i16 %bswap to i32

I think this one it will introduce an addtional and operation if we replace zext to anyext

arsenm · 2025-01-14T12:51:24Z

llvm/lib/Target/AMDGPU/SIInstructions.td

@@ -3039,6 +3041,19 @@ def : GCNPat <
  (i32 (zext (bswap i16:$a))),
  (V_PERM_B32_e64  (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
 >;
+}
+
+let True16Predicate = UseRealTrue16Insts in {


I assume UseRealTrue16Insts implies has perm

Hi Matt. Yes UseRealTrue16Insts implies GFX11Plus

These patterns are also guarded by

llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td

Line 3008 in 98da375

let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in {

.

true16 pattern for bswap

98da375

broxigarchen marked this pull request as ready for review January 14, 2025 02:40

broxigarchen requested review from kosarev and arsenm January 14, 2025 02:40

llvmbot added the backend:AMDGPU label Jan 14, 2025

broxigarchen requested a review from Sisyph January 14, 2025 02:41

kosarev approved these changes Jan 14, 2025

View reviewed changes

arsenm reviewed Jan 14, 2025

View reviewed changes

broxigarchen requested a review from arsenm January 16, 2025 16:07

arsenm approved these changes Jan 17, 2025

View reviewed changes

broxigarchen merged commit 703e9e9 into llvm:main Jan 17, 2025
12 checks passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AMDGPU][True16][CodeGen] true16 codegen for bswap #122849

[AMDGPU][True16][CodeGen] true16 codegen for bswap #122849

Uh oh!

broxigarchen commented Jan 14, 2025

Uh oh!

llvmbot commented Jan 14, 2025

Uh oh!

kosarev left a comment

Uh oh!

arsenm Jan 14, 2025

Uh oh!

broxigarchen Jan 14, 2025 •

edited

Loading

Uh oh!

arsenm Jan 15, 2025

Uh oh!

broxigarchen Jan 16, 2025 •

edited

Loading

Uh oh!

arsenm Jan 14, 2025

Uh oh!

broxigarchen Jan 14, 2025

Uh oh!

Uh oh!

Uh oh!

	%bswap = call i16 @llvm.bswap.i16(i16 %src)
	%zext = zext i16 %bswap to i32

[AMDGPU][True16][CodeGen] true16 codegen for bswap #122849

[AMDGPU][True16][CodeGen] true16 codegen for bswap #122849

Uh oh!

Conversation

broxigarchen commented Jan 14, 2025

Uh oh!

llvmbot commented Jan 14, 2025

Uh oh!

kosarev left a comment

Choose a reason for hiding this comment

Uh oh!

arsenm Jan 14, 2025

Choose a reason for hiding this comment

Uh oh!

broxigarchen Jan 14, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

arsenm Jan 15, 2025

Choose a reason for hiding this comment

Uh oh!

broxigarchen Jan 16, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

arsenm Jan 14, 2025

Choose a reason for hiding this comment

Uh oh!

broxigarchen Jan 14, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

broxigarchen Jan 14, 2025 •

edited

Loading

broxigarchen Jan 16, 2025 •

edited

Loading