Skip to content

Commit b3a8c1e

Browse files
VigneshwarJ and arsenm authored
[AMDGPU] Bugfix for scaled MFMA parsing FP literals (#142493)
Fix parsing of FP literals used as scale values in scaled MFMA instructions. Because the order of operands in the MCInst differs from the order of the parsed operands, the FP literal immediates for the scale values were not parsed correctly. --------- Co-authored-by: Matt Arsenault <[email protected]>
1 parent e129c3c commit b3a8c1e

File tree

5 files changed

+462
-8
lines changed

5 files changed

+462
-8
lines changed

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8826,6 +8826,7 @@ void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst,
88268826
OptionalImmIndexMap OptionalIdx;
88278827
unsigned Opc = Inst.getOpcode();
88288828
unsigned I = 1;
8829+
int CbszOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::cbsz);
88298830

88308831
const MCInstrDesc &Desc = MII.get(Opc);
88318832

@@ -8834,8 +8835,15 @@ void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst,
88348835

88358836
for (unsigned E = Operands.size(); I != E; ++I) {
88368837
AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands[I]);
8837-
8838-
if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
8838+
int NumOperands = Inst.getNumOperands();
8839+
// The order of operands in MCInst and parsed operands are different.
8840+
// Adding dummy cbsz and blgp operands at corresponding MCInst operand
8841+
// indices for parsing scale values correctly.
8842+
if (NumOperands == CbszOpIdx) {
8843+
Inst.addOperand(MCOperand::createImm(0));
8844+
Inst.addOperand(MCOperand::createImm(0));
8845+
}
8846+
if (isRegOrImmWithInputMods(Desc, NumOperands)) {
88398847
Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
88408848
} else if (Op.isImmModifier()) {
88418849
OptionalIdx[Op.getImmTy()] = I;
@@ -8845,12 +8853,18 @@ void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst,
88458853
}
88468854

88478855
// Insert CBSZ and BLGP operands for F8F6F4 variants
8848-
int InsertPos = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::cbsz);
8849-
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCBSZ,
8850-
0, InsertPos);
8851-
InsertPos = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::blgp);
8852-
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyBLGP,
8853-
0, InsertPos);
8856+
auto CbszIdx = OptionalIdx.find(AMDGPUOperand::ImmTyCBSZ);
8857+
if (CbszIdx != OptionalIdx.end()) {
8858+
int CbszVal = ((AMDGPUOperand &)*Operands[CbszIdx->second]).getImm();
8859+
Inst.getOperand(CbszOpIdx).setImm(CbszVal);
8860+
}
8861+
8862+
int BlgpOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::blgp);
8863+
auto BlgpIdx = OptionalIdx.find(AMDGPUOperand::ImmTyBLGP);
8864+
if (BlgpIdx != OptionalIdx.end()) {
8865+
int BlgpVal = ((AMDGPUOperand &)*Operands[BlgpIdx->second]).getImm();
8866+
Inst.getOperand(BlgpOpIdx).setImm(BlgpVal);
8867+
}
88548868

88558869
// Add dummy src_modifiers
88568870
Inst.addOperand(MCOperand::createImm(0));

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2024,6 +2024,205 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
20242024
ret void
20252025
}
20262026

2027+
define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 {
2028+
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
2029+
; SDAG: ; %bb.0:
2030+
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2031+
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2032+
; SDAG-NEXT: s_movk_i32 s6, 0x41
2033+
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2034+
; SDAG-NEXT: v_mov_b32_e32 v16, 0
2035+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2036+
; SDAG-NEXT: v_mov_b32_e32 v0, s8
2037+
; SDAG-NEXT: v_mov_b32_e32 v1, s9
2038+
; SDAG-NEXT: v_mov_b32_e32 v2, s10
2039+
; SDAG-NEXT: v_mov_b32_e32 v3, s11
2040+
; SDAG-NEXT: v_mov_b32_e32 v4, s12
2041+
; SDAG-NEXT: v_mov_b32_e32 v5, s13
2042+
; SDAG-NEXT: v_mov_b32_e32 v6, s14
2043+
; SDAG-NEXT: v_mov_b32_e32 v7, s15
2044+
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2045+
; SDAG-NEXT: v_mov_b32_e32 v8, s16
2046+
; SDAG-NEXT: v_mov_b32_e32 v9, s17
2047+
; SDAG-NEXT: v_mov_b32_e32 v10, s18
2048+
; SDAG-NEXT: v_mov_b32_e32 v11, s19
2049+
; SDAG-NEXT: v_mov_b32_e32 v12, s20
2050+
; SDAG-NEXT: v_mov_b32_e32 v13, s21
2051+
; SDAG-NEXT: v_mov_b32_e32 v14, s22
2052+
; SDAG-NEXT: v_mov_b32_e32 v15, s23
2053+
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2054+
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2055+
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2056+
; SDAG-NEXT: s_nop 1
2057+
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2058+
; SDAG-NEXT: s_nop 7
2059+
; SDAG-NEXT: s_nop 3
2060+
; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2061+
; SDAG-NEXT: s_endpgm
2062+
;
2063+
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
2064+
; GISEL: ; %bb.0:
2065+
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2066+
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2067+
; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
2068+
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2069+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2070+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2071+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2072+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2073+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2074+
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2075+
; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2076+
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2077+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2078+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2079+
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2080+
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2081+
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2082+
; GISEL-NEXT: s_nop 1
2083+
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2084+
; GISEL-NEXT: v_mov_b32_e32 v0, 0
2085+
; GISEL-NEXT: s_nop 7
2086+
; GISEL-NEXT: s_nop 2
2087+
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2088+
; GISEL-NEXT: s_endpgm
2089+
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216)
2090+
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
2091+
ret void
2092+
}
2093+
2094+
define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 {
2095+
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
2096+
; SDAG: ; %bb.0:
2097+
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2098+
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2099+
; SDAG-NEXT: v_mov_b32_e32 v16, 0
2100+
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2101+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2102+
; SDAG-NEXT: v_mov_b32_e32 v0, s8
2103+
; SDAG-NEXT: v_mov_b32_e32 v1, s9
2104+
; SDAG-NEXT: v_mov_b32_e32 v2, s10
2105+
; SDAG-NEXT: v_mov_b32_e32 v3, s11
2106+
; SDAG-NEXT: v_mov_b32_e32 v4, s12
2107+
; SDAG-NEXT: v_mov_b32_e32 v5, s13
2108+
; SDAG-NEXT: v_mov_b32_e32 v6, s14
2109+
; SDAG-NEXT: v_mov_b32_e32 v7, s15
2110+
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2111+
; SDAG-NEXT: v_mov_b32_e32 v8, s16
2112+
; SDAG-NEXT: v_mov_b32_e32 v9, s17
2113+
; SDAG-NEXT: v_mov_b32_e32 v10, s18
2114+
; SDAG-NEXT: v_mov_b32_e32 v11, s19
2115+
; SDAG-NEXT: v_mov_b32_e32 v12, s20
2116+
; SDAG-NEXT: v_mov_b32_e32 v13, s21
2117+
; SDAG-NEXT: v_mov_b32_e32 v14, s22
2118+
; SDAG-NEXT: v_mov_b32_e32 v15, s23
2119+
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2120+
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2121+
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2122+
; SDAG-NEXT: s_nop 1
2123+
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2124+
; SDAG-NEXT: s_nop 7
2125+
; SDAG-NEXT: s_nop 3
2126+
; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2127+
; SDAG-NEXT: s_endpgm
2128+
;
2129+
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
2130+
; GISEL: ; %bb.0:
2131+
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2132+
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2133+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2134+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2135+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2136+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2137+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2138+
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2139+
; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2140+
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2141+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2142+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2143+
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2144+
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2145+
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2146+
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2147+
; GISEL-NEXT: s_nop 0
2148+
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2149+
; GISEL-NEXT: v_mov_b32_e32 v0, 0
2150+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2151+
; GISEL-NEXT: s_nop 7
2152+
; GISEL-NEXT: s_nop 1
2153+
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2154+
; GISEL-NEXT: s_endpgm
2155+
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2)
2156+
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
2157+
ret void
2158+
}
2159+
2160+
define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 {
2161+
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
2162+
; SDAG: ; %bb.0:
2163+
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2164+
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2165+
; SDAG-NEXT: v_mov_b32_e32 v16, 0
2166+
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2167+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2168+
; SDAG-NEXT: v_mov_b32_e32 v0, s8
2169+
; SDAG-NEXT: v_mov_b32_e32 v1, s9
2170+
; SDAG-NEXT: v_mov_b32_e32 v2, s10
2171+
; SDAG-NEXT: v_mov_b32_e32 v3, s11
2172+
; SDAG-NEXT: v_mov_b32_e32 v4, s12
2173+
; SDAG-NEXT: v_mov_b32_e32 v5, s13
2174+
; SDAG-NEXT: v_mov_b32_e32 v6, s14
2175+
; SDAG-NEXT: v_mov_b32_e32 v7, s15
2176+
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2177+
; SDAG-NEXT: v_mov_b32_e32 v8, s16
2178+
; SDAG-NEXT: v_mov_b32_e32 v9, s17
2179+
; SDAG-NEXT: v_mov_b32_e32 v10, s18
2180+
; SDAG-NEXT: v_mov_b32_e32 v11, s19
2181+
; SDAG-NEXT: v_mov_b32_e32 v12, s20
2182+
; SDAG-NEXT: v_mov_b32_e32 v13, s21
2183+
; SDAG-NEXT: v_mov_b32_e32 v14, s22
2184+
; SDAG-NEXT: v_mov_b32_e32 v15, s23
2185+
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2186+
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2187+
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2188+
; SDAG-NEXT: s_nop 1
2189+
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2190+
; SDAG-NEXT: s_nop 7
2191+
; SDAG-NEXT: s_nop 3
2192+
; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2193+
; SDAG-NEXT: s_endpgm
2194+
;
2195+
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
2196+
; GISEL: ; %bb.0:
2197+
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2198+
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2199+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2200+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2201+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2202+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2203+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2204+
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2205+
; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2206+
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2207+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2208+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2209+
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2210+
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2211+
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2212+
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2213+
; GISEL-NEXT: s_nop 0
2214+
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2215+
; GISEL-NEXT: v_mov_b32_e32 v0, 0
2216+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2217+
; GISEL-NEXT: s_nop 7
2218+
; GISEL-NEXT: s_nop 1
2219+
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2220+
; GISEL-NEXT: s_endpgm
2221+
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491)
2222+
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
2223+
ret void
2224+
}
2225+
20272226
; This should be optimized to avoid the scale
20282227
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
20292228
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:

0 commit comments

Comments
 (0)