-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[AMDGPU][True16][MC] true16 for v_fma_f16 #119477
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][True16][MC] true16 for v_fma_f16 #119477
Conversation
d40b7fb
to
4f7e73e
Compare
@llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) ChangesSupport true16 format for v_fma_f16 in MC. Since we are replacing v_fma_f16 to v_fma_f16_t16/v_fma_f16_fake16 in Post-GFX11, have to update the CodeGen pattern for v_fma_f16_fake16 to get CodeGen test passing. There is no pattern modified/created, but just replacing the v_fma_f16 with fake16 format. Patch is 172.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119477.diff 17 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 4fb5cb066be7c3..c2199fd587bea6 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -199,7 +199,7 @@ static unsigned macToMad(unsigned Opc) {
case AMDGPU::V_FMAC_F16_e64:
return AMDGPU::V_FMA_F16_gfx9_e64;
case AMDGPU::V_FMAC_F16_fake16_e64:
- return AMDGPU::V_FMA_F16_gfx9_e64;
+ return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
case AMDGPU::V_FMAC_LEGACY_F32_e64:
return AMDGPU::V_FMA_LEGACY_F32_e64;
case AMDGPU::V_FMAC_F64_e64:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 4a94d690297949..22e96b780ed21f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4033,14 +4033,15 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
if (Src0Literal && !ST.hasVOP3Literal())
return nullptr;
- unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
- : IsF64 ? AMDGPU::V_FMA_F64_e64
- : IsLegacy
- ? AMDGPU::V_FMA_LEGACY_F32_e64
- : AMDGPU::V_FMA_F32_e64
- : IsF16 ? AMDGPU::V_MAD_F16_e64
- : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
- : AMDGPU::V_MAD_F32_e64;
+ unsigned NewOpc = IsFMA ? IsF16 ? ST.hasTrue16BitInsts()
+ ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
+ : AMDGPU::V_FMA_F16_gfx9_e64
+ : IsF64 ? AMDGPU::V_FMA_F64_e64
+ : IsLegacy ? AMDGPU::V_FMA_LEGACY_F32_e64
+ : AMDGPU::V_FMA_F32_e64
+ : IsF16 ? AMDGPU::V_MAD_F16_e64
+ : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
+ : AMDGPU::V_MAD_F32_e64;
if (pseudoToMCOpcode(NewOpc) == -1)
return nullptr;
@@ -9285,6 +9286,7 @@ static bool isRenamedInGFX9(int Opcode) {
//
case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
+ case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
case AMDGPU::V_INTERP_P2_F16:
case AMDGPU::V_MAD_F16_e64:
case AMDGPU::V_MAD_U16_e64:
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 42df4576a774d5..979812e07fc3f7 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -455,6 +455,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
+ case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16;
break;
@@ -484,6 +485,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
+ case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16;
break;
@@ -956,7 +958,8 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
- MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64) {
+ MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
+ MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) {
shrinkMadFma(MI);
continue;
}
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 8a9f8aa3d16d3a..807b6a49e8e8f0 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -341,7 +341,7 @@ let FPDPRounding = 1 in {
let SubtargetPredicate = isGFX9Plus in {
defm V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9",
VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup>;
- defm V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, any_fma>;
+ defm V_FMA_F16_gfx9 : VOP3Inst_t16 <"v_fma_f16_gfx9", VOP_F16_F16_F16_F16, any_fma>;
} // End SubtargetPredicate = isGFX9Plus
} // End FPDPRounding = 1
@@ -1706,7 +1706,7 @@ defm V_PERM_B32 : VOP3_Realtriple_gfx11_gfx12<0x244>;
defm V_XAD_U32 : VOP3_Realtriple_gfx11_gfx12<0x245>;
defm V_LSHL_ADD_U32 : VOP3_Realtriple_gfx11_gfx12<0x246>;
defm V_ADD_LSHL_U32 : VOP3_Realtriple_gfx11_gfx12<0x247>;
-defm V_FMA_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x248, "V_FMA_F16_gfx9", "v_fma_f16">;
+defm V_FMA_F16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x248, "v_fma_f16", "V_FMA_F16_gfx9">;
defm V_MIN3_F16 : VOP3_Realtriple_gfx11<0x249>;
defm V_MIN3_I16 : VOP3_Realtriple_gfx11_gfx12<0x24a>;
defm V_MIN3_U16 : VOP3_Realtriple_gfx11_gfx12<0x24b>;
diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
index cefd24032871f4..85c65778933964 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
+++ b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
@@ -18,7 +18,7 @@ body: |
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec
- ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, killed [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, killed [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
%0 = IMPLICIT_DEF
%1 = COPY %0.sub1
%2 = COPY %0.sub0
@@ -43,7 +43,7 @@ body: |
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec
- ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, [[COPY1]], 0, killed [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, [[COPY1]], 0, killed [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
%0 = IMPLICIT_DEF
%1 = COPY %0.sub1
%2 = COPY %0.sub0
@@ -68,7 +68,7 @@ body: |
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec
- ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, killed [[COPY]], 0, [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, killed [[COPY]], 0, [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
%0 = IMPLICIT_DEF
%1 = COPY %0.sub0
%2 = COPY %0.sub1
@@ -90,7 +90,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 49664, implicit $exec
- ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, 16384, 0, killed [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, 16384, 0, killed [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
%0:vgpr_32 = COPY killed $vgpr0
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
index b649bab532f262..b796d51a1f0d73 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
@@ -2174,53 +2174,89 @@ v_fma_dx9_zero_f32 v5, -src_scc, |vcc_lo|, -1 mul:4
v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2
// GFX11: v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x09,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]
-v_fma_f16 v5, v1, v2, s3
-// GFX11: v_fma_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+v_fma_f16 v5.l, v1.l, v2.l, s3
+// GFX11: v_fma_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
-v_fma_f16 v5, v255, s2, s105
-// GFX11: v_fma_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+v_fma_f16 v5.l, v255.l, s2, s105
+// GFX11: v_fma_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
-v_fma_f16 v5, s1, v255, exec_hi
-// GFX11: v_fma_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+v_fma_f16 v5.l, s1, v255.l, exec_hi
+// GFX11: v_fma_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
-v_fma_f16 v5, s105, s105, exec_lo
-// GFX11: v_fma_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+v_fma_f16 v5.l, s105, s105, exec_lo
+// GFX11: v_fma_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
-v_fma_f16 v5, vcc_lo, ttmp15, v3
-// GFX11: v_fma_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX11: v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
-v_fma_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX11: v_fma_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX11: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
-v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX11: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX11: v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
-v_fma_f16 v5, m0, 0.5, m0
-// GFX11: v_fma_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+v_fma_f16 v5.l, m0, 0.5, m0
+// GFX11: v_fma_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
-v_fma_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX11: v_fma_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX11: v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
-v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
-// GFX11: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_fma_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
+// GFX11: v_fma_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
-v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0]
-// GFX11: v_fma_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX11: v_fma_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
-v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
-// GFX11: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
+// GFX11: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
-v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
-// GFX11: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
+// GFX11: v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
-v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
-// GFX11: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
+// GFX11: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
-v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp
-// GFX11: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp
+// GFX11: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
-v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2
-// GFX11: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2
+// GFX11: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+v_fma_f16 v5.l, v255.h, s2, s105
+// GFX11: v_fma_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+
+v_fma_f16 v5.l, s1, v255.h, exec_hi
+// GFX11: v_fma_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+
+v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_fma_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX11: v_fma_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+
+v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX11: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+
+v_fma_f16 v5.l, 0.5, -m0, 0.5
+// GFX11: v_fma_f16 v5.l, 0.5, -m0, 0.5 ; encoding: [0x05,0x00,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+
+v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1
+// GFX11: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 ; encoding: [0x05,0x02,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+
+v_fma_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX11: v_fma_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+v_fma_f16 v5.l, v255.h, s2, s105
+// GFX11: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+
+v_fma_f16 v5.l, s1, v255.h, exec_hi
+// GFX11: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+
+v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp
+// GFX11: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
v_fma_f32 v5, v1, v2, s3
// GFX11: v_fma_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x13,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
index e6f868d2b40e7e..777380e890f339 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
@@ -1448,47 +1448,83 @@ v_div_fixup_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask
v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX11: v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x54,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
-v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
-v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
-v_fma_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
-v_fma_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
-v_fma_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
-v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
-v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
-v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX11: v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
-v_fma_f1...
[truncated]
|
@llvm/pr-subscribers-mc Author: Brox Chen (broxigarchen) ChangesSupport true16 format for v_fma_f16 in MC. Since we are replacing v_fma_f16 to v_fma_f16_t16/v_fma_f16_fake16 in Post-GFX11, have to update the CodeGen pattern for v_fma_f16_fake16 to get CodeGen test passing. There is no pattern modified/created, but just replacing the v_fma_f16 with fake16 format. Patch is 172.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119477.diff 17 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 4fb5cb066be7c3..c2199fd587bea6 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -199,7 +199,7 @@ static unsigned macToMad(unsigned Opc) {
case AMDGPU::V_FMAC_F16_e64:
return AMDGPU::V_FMA_F16_gfx9_e64;
case AMDGPU::V_FMAC_F16_fake16_e64:
- return AMDGPU::V_FMA_F16_gfx9_e64;
+ return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
case AMDGPU::V_FMAC_LEGACY_F32_e64:
return AMDGPU::V_FMA_LEGACY_F32_e64;
case AMDGPU::V_FMAC_F64_e64:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 4a94d690297949..22e96b780ed21f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4033,14 +4033,15 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
if (Src0Literal && !ST.hasVOP3Literal())
return nullptr;
- unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
- : IsF64 ? AMDGPU::V_FMA_F64_e64
- : IsLegacy
- ? AMDGPU::V_FMA_LEGACY_F32_e64
- : AMDGPU::V_FMA_F32_e64
- : IsF16 ? AMDGPU::V_MAD_F16_e64
- : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
- : AMDGPU::V_MAD_F32_e64;
+ unsigned NewOpc = IsFMA ? IsF16 ? ST.hasTrue16BitInsts()
+ ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
+ : AMDGPU::V_FMA_F16_gfx9_e64
+ : IsF64 ? AMDGPU::V_FMA_F64_e64
+ : IsLegacy ? AMDGPU::V_FMA_LEGACY_F32_e64
+ : AMDGPU::V_FMA_F32_e64
+ : IsF16 ? AMDGPU::V_MAD_F16_e64
+ : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
+ : AMDGPU::V_MAD_F32_e64;
if (pseudoToMCOpcode(NewOpc) == -1)
return nullptr;
@@ -9285,6 +9286,7 @@ static bool isRenamedInGFX9(int Opcode) {
//
case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
+ case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
case AMDGPU::V_INTERP_P2_F16:
case AMDGPU::V_MAD_F16_e64:
case AMDGPU::V_MAD_U16_e64:
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 42df4576a774d5..979812e07fc3f7 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -455,6 +455,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
+ case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16;
break;
@@ -484,6 +485,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
+ case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16;
break;
@@ -956,7 +958,8 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
- MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64) {
+ MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
+ MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) {
shrinkMadFma(MI);
continue;
}
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 8a9f8aa3d16d3a..807b6a49e8e8f0 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -341,7 +341,7 @@ let FPDPRounding = 1 in {
let SubtargetPredicate = isGFX9Plus in {
defm V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9",
VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup>;
- defm V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, any_fma>;
+ defm V_FMA_F16_gfx9 : VOP3Inst_t16 <"v_fma_f16_gfx9", VOP_F16_F16_F16_F16, any_fma>;
} // End SubtargetPredicate = isGFX9Plus
} // End FPDPRounding = 1
@@ -1706,7 +1706,7 @@ defm V_PERM_B32 : VOP3_Realtriple_gfx11_gfx12<0x244>;
defm V_XAD_U32 : VOP3_Realtriple_gfx11_gfx12<0x245>;
defm V_LSHL_ADD_U32 : VOP3_Realtriple_gfx11_gfx12<0x246>;
defm V_ADD_LSHL_U32 : VOP3_Realtriple_gfx11_gfx12<0x247>;
-defm V_FMA_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x248, "V_FMA_F16_gfx9", "v_fma_f16">;
+defm V_FMA_F16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x248, "v_fma_f16", "V_FMA_F16_gfx9">;
defm V_MIN3_F16 : VOP3_Realtriple_gfx11<0x249>;
defm V_MIN3_I16 : VOP3_Realtriple_gfx11_gfx12<0x24a>;
defm V_MIN3_U16 : VOP3_Realtriple_gfx11_gfx12<0x24b>;
diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
index cefd24032871f4..85c65778933964 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
+++ b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
@@ -18,7 +18,7 @@ body: |
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec
- ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, killed [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, killed [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
%0 = IMPLICIT_DEF
%1 = COPY %0.sub1
%2 = COPY %0.sub0
@@ -43,7 +43,7 @@ body: |
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec
- ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, [[COPY1]], 0, killed [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, [[COPY1]], 0, killed [[V_MOV_B32_e32_]], 0, killed [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
%0 = IMPLICIT_DEF
%1 = COPY %0.sub1
%2 = COPY %0.sub0
@@ -68,7 +68,7 @@ body: |
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub1
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1078523331, implicit $exec
- ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, killed [[COPY]], 0, [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, killed [[COPY]], 0, [[COPY1]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
%0 = IMPLICIT_DEF
%1 = COPY %0.sub0
%2 = COPY %0.sub1
@@ -90,7 +90,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 49664, implicit $exec
- ; GFX11-NEXT: [[V_FMA_F16_gfx9_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_e64 0, 16384, 0, killed [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FMA_F16_gfx9_fake16_e64_:%[0-9]+]]:vgpr_32 = V_FMA_F16_gfx9_fake16_e64 0, 16384, 0, killed [[COPY]], 0, [[V_MOV_B32_e32_]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: S_ENDPGM 0
%0:vgpr_32 = COPY killed $vgpr0
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
index b649bab532f262..b796d51a1f0d73 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
@@ -2174,53 +2174,89 @@ v_fma_dx9_zero_f32 v5, -src_scc, |vcc_lo|, -1 mul:4
v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2
// GFX11: v_fma_dx9_zero_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x09,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]
-v_fma_f16 v5, v1, v2, s3
-// GFX11: v_fma_f16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
+v_fma_f16 v5.l, v1.l, v2.l, s3
+// GFX11: v_fma_f16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x48,0xd6,0x01,0x05,0x0e,0x00]
-v_fma_f16 v5, v255, s2, s105
-// GFX11: v_fma_f16 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
+v_fma_f16 v5.l, v255.l, s2, s105
+// GFX11: v_fma_f16 v5.l, v255.l, s2, s105 ; encoding: [0x05,0x00,0x48,0xd6,0xff,0x05,0xa4,0x01]
-v_fma_f16 v5, s1, v255, exec_hi
-// GFX11: v_fma_f16 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
+v_fma_f16 v5.l, s1, v255.l, exec_hi
+// GFX11: v_fma_f16 v5.l, s1, v255.l, exec_hi ; encoding: [0x05,0x00,0x48,0xd6,0x01,0xfe,0xff,0x01]
-v_fma_f16 v5, s105, s105, exec_lo
-// GFX11: v_fma_f16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
+v_fma_f16 v5.l, s105, s105, exec_lo
+// GFX11: v_fma_f16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x48,0xd6,0x69,0xd2,0xf8,0x01]
-v_fma_f16 v5, vcc_lo, ttmp15, v3
-// GFX11: v_fma_f16 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
+v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX11: v_fma_f16 v5.l, vcc_lo, ttmp15, v3.l ; encoding: [0x05,0x00,0x48,0xd6,0x6a,0xf6,0x0c,0x04]
-v_fma_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX11: v_fma_f16 v5, vcc_hi, 0xfe0b, v255 ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l
+// GFX11: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.l ; encoding: [0x05,0x00,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
-v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX11: v_fma_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
+v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX11: v_fma_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15| ; encoding: [0x05,0x07,0x48,0xd6,0x7b,0xfa,0xed,0xe1]
-v_fma_f16 v5, m0, 0.5, m0
-// GFX11: v_fma_f16 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
+v_fma_f16 v5.l, m0, 0.5, m0
+// GFX11: v_fma_f16 v5.l, m0, 0.5, m0 ; encoding: [0x05,0x00,0x48,0xd6,0x7d,0xe0,0xf5,0x01]
-v_fma_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX11: v_fma_f16 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
+v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX11: v_fma_f16 v5.l, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x48,0xd6,0x7e,0x82,0xad,0x01]
-v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
-// GFX11: v_fma_f16 v5, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_fma_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1]
+// GFX11: v_fma_f16 v5.h, -|exec_hi|, null, -|vcc_lo| op_sel:[1,1,1,1] ; encoding: [0x05,0x7d,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
-v_fma_f16 v5, null, exec_lo, -|0xfe0b| op_sel:[0,0,0,0]
-// GFX11: v_fma_f16 v5, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX11: v_fma_f16 v5.l, null, exec_lo, -|0xfe0b| ; encoding: [0x05,0x04,0x48,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
-v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
-// GFX11: v_fma_f16 v5, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0]
+// GFX11: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| op_sel:[1,0,0,0] ; encoding: [0x05,0x0e,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
-v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
-// GFX11: v_fma_f16 v5, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0]
+// GFX11: v_fma_f16 v5.l, 0.5, -m0, 0.5 op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
-v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
-// GFX11: v_fma_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
+// GFX11: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0] ; encoding: [0x05,0x22,0x48,0xd6,0xfd,0xd4,0x04,0x23]
-v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp
-// GFX11: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp
+// GFX11: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
-v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2
-// GFX11: v_fma_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2
+// GFX11: v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1] clamp div:2 ; encoding: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+v_fma_f16 v5.l, v255.h, s2, s105
+// GFX11: v_fma_f16 v5.l, v255.h, s2, s105 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+
+v_fma_f16 v5.l, s1, v255.h, exec_hi
+// GFX11: v_fma_f16 v5.l, s1, v255.h, exec_hi op_sel:[0,1,0,0] ; encoding: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+
+v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h op_sel:[0,0,1,0] ; encoding: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_fma_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX11: v_fma_f16 v5.l, -|exec_hi|, null, -|vcc_lo| ; encoding: [0x05,0x05,0x48,0xd6,0x7f,0xf8,0xa8,0xa1]
+
+v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX11: v_fma_f16 v5.l, -1, -|exec_hi|, -|src_scc| ; encoding: [0x05,0x06,0x48,0xd6,0xc1,0xfe,0xf4,0xc3]
+
+v_fma_f16 v5.l, 0.5, -m0, 0.5
+// GFX11: v_fma_f16 v5.l, 0.5, -m0, 0.5 ; encoding: [0x05,0x00,0x48,0xd6,0xf0,0xfa,0xc0,0x43]
+
+v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1
+// GFX11: v_fma_f16 v5.l, -src_scc, |vcc_lo|, -1 ; encoding: [0x05,0x02,0x48,0xd6,0xfd,0xd4,0x04,0x23]
+
+v_fma_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2
+// GFX11: v_fma_f16 v255.l, -|0xfe0b|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x48,0xd6,0xff,0xd6,0xf0,0x79,0x0b,0xfe,0x00,0x00]
+
+v_fma_f16 v5.l, v255.h, s2, s105
+// GFX11: [0x05,0x08,0x48,0xd6,0xff,0x05,0xa4,0x01]
+
+v_fma_f16 v5.l, s1, v255.h, exec_hi
+// GFX11: [0x05,0x10,0x48,0xd6,0x01,0xfe,0xff,0x01]
+
+v_fma_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: [0x05,0x20,0x48,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_fma_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null clamp
+// GFX11: [0xff,0xc3,0x48,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
v_fma_f32 v5, v1, v2, s3
// GFX11: v_fma_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x13,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
index e6f868d2b40e7e..777380e890f339 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
@@ -1448,47 +1448,83 @@ v_div_fixup_f16_e64_dpp v5, v1, -|v2|, -|0.5| row_xmask:0 row_mask:0x1 bank_mask
v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX11: v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x87,0x54,0xd6,0xfa,0xfe,0xf7,0xe3,0xff,0x6f,0x05,0x30]
-v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
-v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
-v_fma_f16_e64_dpp v5, v1, v2, v3 row_mirror
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v3.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
-v_fma_f16_e64_dpp v5, v1, v2, v255 row_half_mirror
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, v255 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, v255.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x41,0x01,0xff]
-v_fma_f16_e64_dpp v5, v1, v2, s105 row_shl:1
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, s105 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x01,0x01,0xff]
-v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xae,0x01,0x01,0x0f,0x01,0xff]
-v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1
-// GFX11: v_fma_f16_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1
+// GFX11: v_fma_f16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
-v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15
-// GFX11: v_fma_f16_e64_dpp v5, |v1|, v2, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
+v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf
+// GFX11: v_fma_f16_e64_dpp v5.l, |v1.l|, v2.l, -ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x48,0xd6,0xfa,0x04,0xee,0x81,0x01,0x1f,0x01,0xff]
-v_fma_f1...
[truncated]
|
4f7e73e
to
fc17de9
Compare
unsigned NewOpc = IsFMA ? IsF16 ? ST.hasTrue16BitInsts() | ||
? AMDGPU::V_FMA_F16_gfx9_fake16_e64 | ||
: AMDGPU::V_FMA_F16_gfx9_e64 | ||
: IsF64 ? AMDGPU::V_FMA_F64_e64 | ||
: IsLegacy ? AMDGPU::V_FMA_LEGACY_F32_e64 | ||
: AMDGPU::V_FMA_F32_e64 | ||
: IsF16 ? AMDGPU::V_MAD_F16_e64 | ||
: IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64 | ||
: AMDGPU::V_MAD_F32_e64; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This formatting is getting worse and worse. Should move this to a utility function with some early returns
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated to a lambda. What do you think?
c6576b9
to
c1b2563
Compare
rebased |
: IsF16 ? AMDGPU::V_MAD_F16_e64 | ||
: IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64 | ||
: AMDGPU::V_MAD_F32_e64; | ||
auto getNewFMAInst = [&]() { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
auto getNewFMAInst = [&]() { | |
auto getNewFMAInst = [=]() { |
I would just move this to a full fledged helper function instead of a lambda. It would also be more straightforward to just switch over all the possibilities, rather than going backwards from these IsFMA / IsF16 / IsLegacy derived from lists of opcodes above
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
c1b2563
to
6dfdcee
Compare
rebased |
cc68c34
to
6137af9
Compare
6137af9
to
0365bf8
Compare
rebased. Added GFX12 runline in codeGen test |
Support true16 format for v_fma_f16 in MC.
Since we are replacing v_fma_f16 to v_fma_f16_t16/v_fma_f16_fake16 in Post-GFX11, have to update the CodeGen pattern for v_fma_f16_fake16 to get CodeGen test passing. There is no pattern modified/created, but just replacing the v_fma_f16 with fake16 format.