Skip to content

Commit ec66c4a

Browse files
authored
[AMDGPU][True16][CodeGen] true16 codegen pattern for f16 canonicalize (#122000)
true16 codegen pattern for f16 canonicalize
1 parent a94226f commit ec66c4a

File tree

3 files changed

+304
-135
lines changed

3 files changed

+304
-135
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 27 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -3093,7 +3093,7 @@ foreach vt = [f16, v2f16, f32, v2f32, f64] in {
30933093
// Prefer selecting to max when legal, but using mul is always valid.
30943094
let AddedComplexity = -5 in {
30953095

3096-
let OtherPredicates = [NotHasTrue16BitInsts] in {
3096+
let True16Predicate = NotHasTrue16BitInsts in {
30973097
def : GCNPat<
30983098
(fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
30993099
(V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
@@ -3103,9 +3103,21 @@ def : GCNPat<
31033103
(fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
31043104
(V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
31053105
>;
3106-
} // End OtherPredicates
3106+
} // End True16Predicate
31073107

3108-
let OtherPredicates = [HasTrue16BitInsts] in {
3108+
let True16Predicate = UseRealTrue16Insts in {
3109+
def : GCNPat<
3110+
(fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
3111+
(V_MUL_F16_t16_e64 0, (i16 CONST.FP16_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0)
3112+
>;
3113+
3114+
def : GCNPat<
3115+
(fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
3116+
(V_MUL_F16_t16_e64 0, (i16 CONST.FP16_NEG_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0)
3117+
>;
3118+
} // End True16Predicate
3119+
3120+
let True16Predicate = UseFakeTrue16Insts in {
31093121
def : GCNPat<
31103122
(fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
31113123
(V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
@@ -3115,7 +3127,7 @@ def : GCNPat<
31153127
(fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
31163128
(V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
31173129
>;
3118-
} // End OtherPredicates
3130+
} // End True16Predicate
31193131

31203132
def : GCNPat<
31213133
(fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
@@ -3173,13 +3185,22 @@ multiclass SelectCanonicalizeAsMax<
31733185
def : GCNPat<
31743186
(fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
31753187
(V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
3176-
let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts, NotHasTrue16BitInsts]);
3188+
let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts]);
3189+
let True16Predicate = NotHasTrue16BitInsts;
3190+
}
3191+
3192+
def : GCNPat<
3193+
(fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
3194+
(V_MAX_F16_t16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
3195+
let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts]);
3196+
let True16Predicate = UseRealTrue16Insts;
31773197
}
31783198

31793199
def : GCNPat<
31803200
(fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
31813201
(V_MAX_F16_fake16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
3182-
let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts, HasTrue16BitInsts]);
3202+
let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts]);
3203+
let True16Predicate = UseFakeTrue16Insts;
31833204
}
31843205

31853206
def : GCNPat<

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir

Lines changed: 30 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,8 @@
22
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX8 %s
33
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
44
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
5-
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s
5+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
6+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -mattr=-real-true16 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
67

78
---
89

@@ -38,12 +39,20 @@ body: |
3839
; GFX10-NEXT: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
3940
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_e64_]]
4041
;
41-
; GFX11-LABEL: name: fcanonicalize_f16_denorm
42-
; GFX11: liveins: $vgpr0
43-
; GFX11-NEXT: {{ $}}
44-
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
45-
; GFX11-NEXT: [[V_MAX_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_fake16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
46-
; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_fake16_e64_]]
42+
; GFX11-TRUE16-LABEL: name: fcanonicalize_f16_denorm
43+
; GFX11-TRUE16: liveins: $vgpr0
44+
; GFX11-TRUE16-NEXT: {{ $}}
45+
; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
46+
; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
47+
; GFX11-TRUE16-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY1]], 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
48+
; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_t16_e64_]]
49+
;
50+
; GFX11-FAKE16-LABEL: name: fcanonicalize_f16_denorm
51+
; GFX11-FAKE16: liveins: $vgpr0
52+
; GFX11-FAKE16-NEXT: {{ $}}
53+
; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
54+
; GFX11-FAKE16-NEXT: [[V_MAX_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_fake16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
55+
; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_fake16_e64_]]
4756
%0:vgpr(s32) = COPY $vgpr0
4857
%1:vgpr(s16) = G_TRUNC %0
4958
%2:vgpr(s16) = G_FCANONICALIZE %1
@@ -84,12 +93,20 @@ body: |
8493
; GFX10-NEXT: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
8594
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_e64_]]
8695
;
87-
; GFX11-LABEL: name: fcanonicalize_f16_flush
88-
; GFX11: liveins: $vgpr0
89-
; GFX11-NEXT: {{ $}}
90-
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
91-
; GFX11-NEXT: [[V_MAX_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_fake16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
92-
; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_fake16_e64_]]
96+
; GFX11-TRUE16-LABEL: name: fcanonicalize_f16_flush
97+
; GFX11-TRUE16: liveins: $vgpr0
98+
; GFX11-TRUE16-NEXT: {{ $}}
99+
; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
100+
; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
101+
; GFX11-TRUE16-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY1]], 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
102+
; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_t16_e64_]]
103+
;
104+
; GFX11-FAKE16-LABEL: name: fcanonicalize_f16_flush
105+
; GFX11-FAKE16: liveins: $vgpr0
106+
; GFX11-FAKE16-NEXT: {{ $}}
107+
; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
108+
; GFX11-FAKE16-NEXT: [[V_MAX_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_fake16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
109+
; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_fake16_e64_]]
93110
%0:vgpr(s32) = COPY $vgpr0
94111
%1:vgpr(s16) = G_TRUNC %0
95112
%2:vgpr(s16) = G_FCANONICALIZE %1

0 commit comments

Comments
 (0)