Skip to content

Commit 0a3a0ea

Browse files
jayfoad and mbrkusanin authored
[AMDGPU] Update uses of new VOP2 pseudos for GFX12 (#78155)
New pseudos were added for instructions that were natively VOP3 on GFX11: V_ADD_F64_pseudo, V_MUL_F64_pseudo, V_MIN_NUM_F64, V_MAX_NUM_F64, V_LSHLREV_B64_pseudo.

Co-authored-by: Mirko Brkusanin <[email protected]>
1 parent f4fbbeb commit 0a3a0ea

File tree

5 files changed

+1029
-168
lines changed

5 files changed

+1029
-168
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1498,6 +1498,7 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
14981498
case AMDGPU::V_MAX_F16_t16_e64:
14991499
case AMDGPU::V_MAX_F16_fake16_e64:
15001500
case AMDGPU::V_MAX_F64_e64:
1501+
case AMDGPU::V_MAX_NUM_F64_e64:
15011502
case AMDGPU::V_PK_MAX_F16: {
15021503
if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
15031504
return nullptr;
@@ -1567,7 +1568,8 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
15671568

15681569
static int getOModValue(unsigned Opc, int64_t Val) {
15691570
switch (Opc) {
1570-
case AMDGPU::V_MUL_F64_e64: {
1571+
case AMDGPU::V_MUL_F64_e64:
1572+
case AMDGPU::V_MUL_F64_pseudo_e64: {
15711573
switch (Val) {
15721574
case 0x3fe0000000000000: // 0.5
15731575
return SIOutMods::DIV2;
@@ -1618,15 +1620,16 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
16181620
unsigned Op = MI.getOpcode();
16191621
switch (Op) {
16201622
case AMDGPU::V_MUL_F64_e64:
1623+
case AMDGPU::V_MUL_F64_pseudo_e64:
16211624
case AMDGPU::V_MUL_F32_e64:
16221625
case AMDGPU::V_MUL_F16_t16_e64:
16231626
case AMDGPU::V_MUL_F16_fake16_e64:
16241627
case AMDGPU::V_MUL_F16_e64: {
16251628
// If output denormals are enabled, omod is ignored.
16261629
if ((Op == AMDGPU::V_MUL_F32_e64 &&
16271630
MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1628-
((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64 ||
1629-
Op == AMDGPU::V_MUL_F16_t16_e64 ||
1631+
((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
1632+
Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
16301633
Op == AMDGPU::V_MUL_F16_fake16_e64) &&
16311634
MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
16321635
return std::pair(nullptr, SIOutMods::NONE);
@@ -1655,15 +1658,16 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
16551658
return std::pair(RegOp, OMod);
16561659
}
16571660
case AMDGPU::V_ADD_F64_e64:
1661+
case AMDGPU::V_ADD_F64_pseudo_e64:
16581662
case AMDGPU::V_ADD_F32_e64:
16591663
case AMDGPU::V_ADD_F16_e64:
16601664
case AMDGPU::V_ADD_F16_t16_e64:
16611665
case AMDGPU::V_ADD_F16_fake16_e64: {
16621666
// If output denormals are enabled, omod is ignored.
16631667
if ((Op == AMDGPU::V_ADD_F32_e64 &&
16641668
MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1665-
((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64 ||
1666-
Op == AMDGPU::V_ADD_F16_t16_e64 ||
1669+
((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
1670+
Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
16671671
Op == AMDGPU::V_ADD_F16_fake16_e64) &&
16681672
MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
16691673
return std::pair(nullptr, SIOutMods::NONE);

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1862,7 +1862,10 @@ class ClampPat<Instruction inst, ValueType vt> : GCNPat <
18621862
>;
18631863

18641864
def : ClampPat<V_MAX_F32_e64, f32>;
1865+
let SubtargetPredicate = isNotGFX12Plus in
18651866
def : ClampPat<V_MAX_F64_e64, f64>;
1867+
let SubtargetPredicate = isGFX12Plus in
1868+
def : ClampPat<V_MAX_NUM_F64_e64, f64>;
18661869
let SubtargetPredicate = NotHasTrue16BitInsts in
18671870
def : ClampPat<V_MAX_F16_e64, f16>;
18681871
let SubtargetPredicate = UseRealTrue16Insts in
@@ -2990,10 +2993,12 @@ def : GCNPat<
29902993
}
29912994

29922995
// TODO: Handle fneg like other types.
2996+
let SubtargetPredicate = isNotGFX12Plus in {
29932997
def : GCNPat<
29942998
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
29952999
(V_MUL_F64_e64 0, CONST.FP64_ONE, $src_mods, $src)
29963000
>;
3001+
}
29973002
} // End AddedComplexity = -5
29983003

29993004
multiclass SelectCanonicalizeAsMax<
@@ -3009,7 +3014,13 @@ multiclass SelectCanonicalizeAsMax<
30093014
def : GCNPat<
30103015
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
30113016
(V_MAX_F64_e64 $src_mods, $src, $src_mods, $src)> {
3012-
let OtherPredicates = f64_preds;
3017+
let OtherPredicates = !listconcat(f64_preds, [isNotGFX12Plus]);
3018+
}
3019+
3020+
def : GCNPat<
3021+
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
3022+
(V_MAX_NUM_F64_e64 $src_mods, $src, $src_mods, $src)> {
3023+
let OtherPredicates = !listconcat(f64_preds, [isGFX12Plus]);
30133024
}
30143025

30153026
def : GCNPat<

llvm/test/CodeGen/AMDGPU/clamp.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -857,7 +857,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %
857857
; GFX12-NEXT: s_wait_kmcnt 0x0
858858
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
859859
; GFX12-NEXT: s_wait_loadcnt 0x0
860-
; GFX12-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp
860+
; GFX12-NEXT: v_max_num_f64_e64 v[0:1], v[0:1], v[0:1] clamp
861861
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
862862
; GFX12-NEXT: s_nop 0
863863
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -938,7 +938,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(
938938
; GFX12-NEXT: s_wait_kmcnt 0x0
939939
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
940940
; GFX12-NEXT: s_wait_loadcnt 0x0
941-
; GFX12-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
941+
; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] clamp
942942
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
943943
; GFX12-NEXT: s_nop 0
944944
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1020,7 +1020,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa
10201020
; GFX12-NEXT: s_wait_kmcnt 0x0
10211021
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
10221022
; GFX12-NEXT: s_wait_loadcnt 0x0
1023-
; GFX12-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
1023+
; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
10241024
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
10251025
; GFX12-NEXT: s_nop 0
10261026
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)

0 commit comments

Comments
 (0)