Skip to content

Commit 4b1b232

Browse files
authored
[AMDGPU] Bugfix for scaled MFMA parsing FP literals (llvm#142493) (llvm#2660)
2 parents faf1611 + b09f4db commit 4b1b232

File tree

5 files changed

+462 additions, -8 deletions

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 22 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -8808,6 +8808,7 @@ void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst,
88088808
OptionalImmIndexMap OptionalIdx;
88098809
unsigned Opc = Inst.getOpcode();
88108810
unsigned I = 1;
8811+
int CbszOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::cbsz);
88118812

88128813
const MCInstrDesc &Desc = MII.get(Opc);
88138814

@@ -8816,8 +8817,15 @@ void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst,
88168817

88178818
for (unsigned E = Operands.size(); I != E; ++I) {
88188819
AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands[I]);
8819-
8820-
if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
8820+
int NumOperands = Inst.getNumOperands();
8821+
// The order of operands in MCInst and parsed operands are different.
8822+
// Adding dummy cbsz and blgp operands at corresponding MCInst operand
8823+
// indices for parsing scale values correctly.
8824+
if (NumOperands == CbszOpIdx) {
8825+
Inst.addOperand(MCOperand::createImm(0));
8826+
Inst.addOperand(MCOperand::createImm(0));
8827+
}
8828+
if (isRegOrImmWithInputMods(Desc, NumOperands)) {
88218829
Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
88228830
} else if (Op.isImmModifier()) {
88238831
OptionalIdx[Op.getImmTy()] = I;
@@ -8827,12 +8835,18 @@ void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst,
88278835
}
88288836

88298837
// Insert CBSZ and BLGP operands for F8F6F4 variants
8830-
int InsertPos = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::cbsz);
8831-
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCBSZ,
8832-
0, InsertPos);
8833-
InsertPos = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::blgp);
8834-
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyBLGP,
8835-
0, InsertPos);
8838+
auto CbszIdx = OptionalIdx.find(AMDGPUOperand::ImmTyCBSZ);
8839+
if (CbszIdx != OptionalIdx.end()) {
8840+
int CbszVal = ((AMDGPUOperand &)*Operands[CbszIdx->second]).getImm();
8841+
Inst.getOperand(CbszOpIdx).setImm(CbszVal);
8842+
}
8843+
8844+
int BlgpOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::blgp);
8845+
auto BlgpIdx = OptionalIdx.find(AMDGPUOperand::ImmTyBLGP);
8846+
if (BlgpIdx != OptionalIdx.end()) {
8847+
int BlgpVal = ((AMDGPUOperand &)*Operands[BlgpIdx->second]).getImm();
8848+
Inst.getOperand(BlgpOpIdx).setImm(BlgpVal);
8849+
}
88368850

88378851
// Add dummy src_modifiers
88388852
Inst.addOperand(MCOperand::createImm(0));

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll

Lines changed: 199 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2030,6 +2030,205 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
20302030
ret void
20312031
}
20322032

2033+
define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 {
2034+
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
2035+
; SDAG: ; %bb.0:
2036+
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2037+
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2038+
; SDAG-NEXT: s_movk_i32 s6, 0x41
2039+
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2040+
; SDAG-NEXT: v_mov_b32_e32 v16, 0
2041+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2042+
; SDAG-NEXT: v_mov_b32_e32 v0, s8
2043+
; SDAG-NEXT: v_mov_b32_e32 v1, s9
2044+
; SDAG-NEXT: v_mov_b32_e32 v2, s10
2045+
; SDAG-NEXT: v_mov_b32_e32 v3, s11
2046+
; SDAG-NEXT: v_mov_b32_e32 v4, s12
2047+
; SDAG-NEXT: v_mov_b32_e32 v5, s13
2048+
; SDAG-NEXT: v_mov_b32_e32 v6, s14
2049+
; SDAG-NEXT: v_mov_b32_e32 v7, s15
2050+
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2051+
; SDAG-NEXT: v_mov_b32_e32 v8, s16
2052+
; SDAG-NEXT: v_mov_b32_e32 v9, s17
2053+
; SDAG-NEXT: v_mov_b32_e32 v10, s18
2054+
; SDAG-NEXT: v_mov_b32_e32 v11, s19
2055+
; SDAG-NEXT: v_mov_b32_e32 v12, s20
2056+
; SDAG-NEXT: v_mov_b32_e32 v13, s21
2057+
; SDAG-NEXT: v_mov_b32_e32 v14, s22
2058+
; SDAG-NEXT: v_mov_b32_e32 v15, s23
2059+
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2060+
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2061+
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2062+
; SDAG-NEXT: s_nop 1
2063+
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2064+
; SDAG-NEXT: s_nop 7
2065+
; SDAG-NEXT: s_nop 3
2066+
; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2067+
; SDAG-NEXT: s_endpgm
2068+
;
2069+
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
2070+
; GISEL: ; %bb.0:
2071+
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2072+
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2073+
; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
2074+
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2075+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2076+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2077+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2078+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2079+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2080+
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2081+
; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2082+
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2083+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2084+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2085+
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2086+
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2087+
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2088+
; GISEL-NEXT: s_nop 1
2089+
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2090+
; GISEL-NEXT: v_mov_b32_e32 v0, 0
2091+
; GISEL-NEXT: s_nop 7
2092+
; GISEL-NEXT: s_nop 2
2093+
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2094+
; GISEL-NEXT: s_endpgm
2095+
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216)
2096+
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
2097+
ret void
2098+
}
2099+
2100+
define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 {
2101+
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
2102+
; SDAG: ; %bb.0:
2103+
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2104+
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2105+
; SDAG-NEXT: v_mov_b32_e32 v16, 0
2106+
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2107+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2108+
; SDAG-NEXT: v_mov_b32_e32 v0, s8
2109+
; SDAG-NEXT: v_mov_b32_e32 v1, s9
2110+
; SDAG-NEXT: v_mov_b32_e32 v2, s10
2111+
; SDAG-NEXT: v_mov_b32_e32 v3, s11
2112+
; SDAG-NEXT: v_mov_b32_e32 v4, s12
2113+
; SDAG-NEXT: v_mov_b32_e32 v5, s13
2114+
; SDAG-NEXT: v_mov_b32_e32 v6, s14
2115+
; SDAG-NEXT: v_mov_b32_e32 v7, s15
2116+
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2117+
; SDAG-NEXT: v_mov_b32_e32 v8, s16
2118+
; SDAG-NEXT: v_mov_b32_e32 v9, s17
2119+
; SDAG-NEXT: v_mov_b32_e32 v10, s18
2120+
; SDAG-NEXT: v_mov_b32_e32 v11, s19
2121+
; SDAG-NEXT: v_mov_b32_e32 v12, s20
2122+
; SDAG-NEXT: v_mov_b32_e32 v13, s21
2123+
; SDAG-NEXT: v_mov_b32_e32 v14, s22
2124+
; SDAG-NEXT: v_mov_b32_e32 v15, s23
2125+
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2126+
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2127+
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2128+
; SDAG-NEXT: s_nop 1
2129+
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2130+
; SDAG-NEXT: s_nop 7
2131+
; SDAG-NEXT: s_nop 3
2132+
; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2133+
; SDAG-NEXT: s_endpgm
2134+
;
2135+
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
2136+
; GISEL: ; %bb.0:
2137+
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2138+
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2139+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2140+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2141+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2142+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2143+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2144+
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2145+
; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2146+
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2147+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2148+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2149+
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2150+
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2151+
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2152+
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2153+
; GISEL-NEXT: s_nop 0
2154+
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2155+
; GISEL-NEXT: v_mov_b32_e32 v0, 0
2156+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2157+
; GISEL-NEXT: s_nop 7
2158+
; GISEL-NEXT: s_nop 1
2159+
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2160+
; GISEL-NEXT: s_endpgm
2161+
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2)
2162+
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
2163+
ret void
2164+
}
2165+
2166+
define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, ptr addrspace(1) %ptr) #0 {
2167+
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
2168+
; SDAG: ; %bb.0:
2169+
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2170+
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2171+
; SDAG-NEXT: v_mov_b32_e32 v16, 0
2172+
; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2173+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2174+
; SDAG-NEXT: v_mov_b32_e32 v0, s8
2175+
; SDAG-NEXT: v_mov_b32_e32 v1, s9
2176+
; SDAG-NEXT: v_mov_b32_e32 v2, s10
2177+
; SDAG-NEXT: v_mov_b32_e32 v3, s11
2178+
; SDAG-NEXT: v_mov_b32_e32 v4, s12
2179+
; SDAG-NEXT: v_mov_b32_e32 v5, s13
2180+
; SDAG-NEXT: v_mov_b32_e32 v6, s14
2181+
; SDAG-NEXT: v_mov_b32_e32 v7, s15
2182+
; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
2183+
; SDAG-NEXT: v_mov_b32_e32 v8, s16
2184+
; SDAG-NEXT: v_mov_b32_e32 v9, s17
2185+
; SDAG-NEXT: v_mov_b32_e32 v10, s18
2186+
; SDAG-NEXT: v_mov_b32_e32 v11, s19
2187+
; SDAG-NEXT: v_mov_b32_e32 v12, s20
2188+
; SDAG-NEXT: v_mov_b32_e32 v13, s21
2189+
; SDAG-NEXT: v_mov_b32_e32 v14, s22
2190+
; SDAG-NEXT: v_mov_b32_e32 v15, s23
2191+
; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
2192+
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
2193+
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
2194+
; SDAG-NEXT: s_nop 1
2195+
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2196+
; SDAG-NEXT: s_nop 7
2197+
; SDAG-NEXT: s_nop 3
2198+
; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
2199+
; SDAG-NEXT: s_endpgm
2200+
;
2201+
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
2202+
; GISEL: ; %bb.0:
2203+
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
2204+
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40
2205+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2206+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2207+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2208+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2209+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2210+
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
2211+
; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
2212+
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
2213+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
2214+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
2215+
; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
2216+
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
2217+
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
2218+
; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50
2219+
; GISEL-NEXT: s_nop 0
2220+
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0]
2221+
; GISEL-NEXT: v_mov_b32_e32 v0, 0
2222+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2223+
; GISEL-NEXT: s_nop 7
2224+
; GISEL-NEXT: s_nop 1
2225+
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
2226+
; GISEL-NEXT: s_endpgm
2227+
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491)
2228+
store <4 x float> %result, ptr addrspace(1) %ptr, align 16
2229+
ret void
2230+
}
2231+
20332232
; This should be optimized to avoid the scale
20342233
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
20352234
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:

0 commit comments

Comments
 (0)