-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU][True16][CodeGen] true16 codegen for bswap #122849
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][True16][CodeGen] true16 codegen for bswap #122849
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) Changestrue16 codegen pattern for bswap Full diff: https://github.com/llvm/llvm-project/pull/122849.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index cdc1132579d8d8..3b924d869a184c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3030,6 +3030,8 @@ def : GCNPat <
// Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24)
// The 12s emit 0s.
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(i16 (bswap i16:$a)),
(V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
@@ -3039,6 +3041,19 @@ def : GCNPat <
(i32 (zext (bswap i16:$a))),
(V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+ (i16 (bswap i16:$a)),
+ (EXTRACT_SUBREG (V_PERM_B32_e64 (i32 0), (COPY VGPR_16:$a), (S_MOV_B32 (i32 0x0c0c0001))), lo16)
+>;
+
+def : GCNPat <
+ (i32 (zext (bswap i16:$a))),
+ (V_PERM_B32_e64 (i32 0), (COPY VGPR_16:$a), (S_MOV_B32 (i32 0x0c0c0001)))
+>;
+}
// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24)
def : GCNPat <
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index 30c8e94c9a27f5..a95a1aba0c9141 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn-- -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-REAL16
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16
declare i16 @llvm.bswap.i16(i16) nounwind readnone
declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) nounwind readnone
@@ -490,13 +491,21 @@ define float @missing_truncate_promote_bswap(i32 %arg) {
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: missing_truncate_promote_bswap:
-; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-REAL16-LABEL: missing_truncate_promote_bswap:
+; GFX11-REAL16: ; %bb.0: ; %bb
+; GFX11-REAL16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001
+; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-REAL16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: missing_truncate_promote_bswap:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%tmp = trunc i32 %arg to i16
%tmp1 = call i16 @llvm.bswap.i16(i16 %tmp)
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
>; | ||
|
||
def : GCNPat < | ||
(i32 (zext (bswap i16:$a))), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should this handle zext or any ext
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi Matt. From the pattern this is zero filling so it should be zext.
This is following the existing pattern here https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AMDGPU/SIInstructions.td#L3039
I don't know if we need to handle sext. It seems rare that if we need to handle a byte swap and at the mean time do a sext
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't mean sext, I mean anyext
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see. I think we can have both zext and anyext here like https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AMDGPU/SIInstructions.td#L2355-L2356
But if there is only one then it should be using zext. The reason would be
zext -> anyext cost nothing, but anyext -> zext will add an addtional and
operation to zero out top bits.
i.e. for example
llvm-project/llvm/test/CodeGen/AMDGPU/bswap.ll
Lines 567 to 568 in 98da375
%bswap = call i16 @llvm.bswap.i16(i16 %src) | |
%zext = zext i16 %bswap to i32 |
and
operation if we replace zext to anyext
@@ -3039,6 +3041,19 @@ def : GCNPat < | |||
(i32 (zext (bswap i16:$a))), | |||
(V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) | |||
>; | |||
} | |||
|
|||
let True16Predicate = UseRealTrue16Insts in { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I assume UseRealTrue16Insts implies has perm
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi Matt. Yes UseRealTrue16Insts implies GFX11Plus
These patterns are also guarded by
let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in { |
true16 codegen pattern for bswap