[AMDGPU] Fix opsel for scaled MFMA operations #140183

Merged: 6 commits, May 21, 2025
87 changes: 80 additions & 7 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1878,6 +1878,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {

void cvtVOP3(MCInst &Inst, const OperandVector &Operands,
OptionalImmIndexMap &OptionalIdx);
void cvtScaledMFMA(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
@@ -6784,17 +6785,25 @@ ParseStatus AMDGPUAsmParser::parseTH(OperandVector &Operands, int64_t &TH) {
return ParseStatus::Success;
}

static void addOptionalImmOperand(
MCInst& Inst, const OperandVector& Operands,
AMDGPUAsmParser::OptionalImmIndexMap& OptionalIdx,
AMDGPUOperand::ImmTy ImmT,
int64_t Default = 0) {
static void
addOptionalImmOperand(MCInst &Inst, const OperandVector &Operands,
AMDGPUAsmParser::OptionalImmIndexMap &OptionalIdx,
AMDGPUOperand::ImmTy ImmT, int64_t Default = 0,
std::optional<unsigned> InsertAt = std::nullopt) {
auto i = OptionalIdx.find(ImmT);
if (i != OptionalIdx.end()) {
unsigned Idx = i->second;
((AMDGPUOperand &)*Operands[Idx]).addImmOperands(Inst, 1);
const AMDGPUOperand &Op =
static_cast<const AMDGPUOperand &>(*Operands[Idx]);
if (InsertAt)
Inst.insert(Inst.begin() + *InsertAt, MCOperand::createImm(Op.getImm()));
else
Op.addImmOperands(Inst, 1);
} else {
Inst.addOperand(MCOperand::createImm(Default));
if (InsertAt.has_value())
Inst.insert(Inst.begin() + *InsertAt, MCOperand::createImm(Default));
else
Inst.addOperand(MCOperand::createImm(Default));
}
}

@@ -8811,6 +8820,70 @@ void AMDGPUAsmParser::cvtVINTERP(MCInst &Inst, const OperandVector &Operands)
Inst.getOperand(ModIdx).setImm(ModVal);
}
}
void AMDGPUAsmParser::cvtScaledMFMA(MCInst &Inst,
const OperandVector &Operands) {
OptionalImmIndexMap OptionalIdx;
unsigned Opc = Inst.getOpcode();
unsigned I = 1;

const MCInstrDesc &Desc = MII.get(Opc);

for (unsigned J = 0; J < Desc.getNumDefs(); ++J)
static_cast<AMDGPUOperand &>(*Operands[I++]).addRegOperands(Inst, 1);

for (unsigned E = Operands.size(); I != E; ++I) {
AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands[I]);

if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
} else if (Op.isImmModifier()) {
OptionalIdx[Op.getImmTy()] = I;
} else {
Op.addRegOrImmOperands(Inst, 1);
}
}

// Insert CBSZ and BLGP operands for F8F6F4 variants
int InsertPos = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::cbsz);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCBSZ,
0, InsertPos);
InsertPos = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::blgp);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyBLGP,
0, InsertPos);

// Add dummy src_modifiers
Inst.addOperand(MCOperand::createImm(0));
Inst.addOperand(MCOperand::createImm(0));

// Handle op_sel fields

unsigned OpSel = 0;
auto OpselIdx = OptionalIdx.find(AMDGPUOperand::ImmTyOpSel);
if (OpselIdx != OptionalIdx.end()) {
OpSel = static_cast<const AMDGPUOperand &>(*Operands[OpselIdx->second])
.getImm();
}

unsigned OpSelHi = 0;
auto OpselHiIdx = OptionalIdx.find(AMDGPUOperand::ImmTyOpSelHi);
if (OpselHiIdx != OptionalIdx.end()) {
OpSelHi = static_cast<const AMDGPUOperand &>(*Operands[OpselHiIdx->second])
.getImm();
}
const AMDGPU::OpName ModOps[] = {AMDGPU::OpName::src0_modifiers,
AMDGPU::OpName::src1_modifiers};

for (unsigned J = 0; J < 2; ++J) {
unsigned ModVal = 0;
if (OpSel & (1 << J))
ModVal |= SISrcMods::OP_SEL_0;
if (OpSelHi & (1 << J))
ModVal |= SISrcMods::OP_SEL_1;

const int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
Inst.getOperand(ModIdx).setImm(ModVal);
}
}
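
For reference, a minimal standalone sketch of the op_sel folding performed by cvtScaledMFMA above. This is not part of the patch; the OP_SEL_0/OP_SEL_1 constants are assumed to follow the usual SISrcMods layout (bits 2 and 3 of a 4-bit src_modifiers value).

#include <cstdint>
#include <cstdio>

// Assumed SISrcMods-style values: OP_SEL_0 = 1 << 2, OP_SEL_1 = 1 << 3.
constexpr unsigned OP_SEL_0 = 1u << 2;
constexpr unsigned OP_SEL_1 = 1u << 3;

// Mirrors the loop in cvtScaledMFMA: bit J of the parsed op_sel/op_sel_hi
// immediates selects the word of scale source J.
void computeScaleMods(unsigned OpSel, unsigned OpSelHi, unsigned ModVal[2]) {
  for (unsigned J = 0; J < 2; ++J) {
    ModVal[J] = 0;
    if (OpSel & (1u << J))
      ModVal[J] |= OP_SEL_0;
    if (OpSelHi & (1u << J))
      ModVal[J] |= OP_SEL_1;
  }
}

int main() {
  unsigned Mods[2];
  // e.g. op_sel:[1,1,0] op_sel_hi:[0,0,0] -> both scale sources get OP_SEL_0.
  computeScaleMods(/*OpSel=*/0b11, /*OpSelHi=*/0b00, Mods);
  std::printf("src0_modifiers=0x%x src1_modifiers=0x%x\n", Mods[0], Mods[1]);
  return 0;
}
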

void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
OptionalImmIndexMap &OptionalIdx) {
15 changes: 7 additions & 8 deletions llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -829,12 +829,12 @@ class MFMA_F8F6F4_WithSizeTable_Helper<VOP3_Pseudo ps, string F8F8Op> :
// Currently assumes scaled instructions never have abid
class MAIFrag<SDPatternOperator Op, code pred, bit HasAbid = true, bit Scaled = false> : PatFrag <
!if(Scaled, (ops node:$src0, node:$src1, node:$src2, node:$cbsz, node:$blgp,
node:$scale_src0_opsel, node:$scale_src0,
node:$scale_src1_opsel, node:$scale_src1),
node:$src0_modifiers, node:$scale_src0,
node:$src1_modifiers, node:$scale_src1),
!con((ops node:$src0, node:$src1, node:$src2, node:$cbsz),
!if(HasAbid, (ops node:$abid), (ops)),
(ops node:$blgp))),
!if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $scale_src0_opsel, $scale_src0, $scale_src1_opsel, $scale_src1),
!if(Scaled, (Op $src0, $src1, $src2, $cbsz, $blgp, $src0_modifiers, $scale_src0, $src1_modifiers, $scale_src1),
!if(HasAbid, (Op $src0, $src1, $src2, $cbsz, $abid, $blgp),
(Op $src0, $src1, $src2, $cbsz, $blgp))),
pred
@@ -895,12 +895,12 @@ class ScaledMAIInst<string OpName, MAIInst BaseInst, SDPatternOperator node> :
let InOperandList = !con(BaseInst.InOperandList,
(ins VSrc_b32:$scale_src0,
VSrc_b32:$scale_src1,
op_sel0:$scale_src0_opsel,
op_sel_hi0:$scale_src1_opsel));
op_sel0:$src0_modifiers,
op_sel_hi0:$src1_modifiers));
let AsmOperands =
"$vdst, $src0, $src1, $src2, $scale_src0, $scale_src1"
"$scale_src0_opsel$scale_src1_opsel$cbsz$blgp";

"$src0_modifiers$src1_modifiers$cbsz$blgp";
let AsmMatchConverter = "cvtScaledMFMA";
let FixedSize = 1;
let Size = 16;
}
@@ -2041,7 +2041,6 @@ multiclass VOP3PX_Real_ScaledMFMA<bits<7> op> {
defvar PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64");
defvar Name = PS_ACD.Mnemonic;
defvar F8F8Name = !substr(NAME, 0, !sub(!size(NAME), !size("_fN_fM")))#"_f8_f8";

let SubtargetPredicate = HasGFX950Insts,
DecoderNamespace = "GFX940",
AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in {
22 changes: 12 additions & 10 deletions llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -526,14 +526,16 @@ class VOP3PXe <bits<7> op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_
bits<9> scale_src0;
bits<9> scale_src1;

bits<2> scale_src0_opsel;
bits<2> scale_src1_opsel;
// MFMALdScaleModifierOp transforms the 2-bit opsel input into a 4-bit value
// where opsel and opsel_hi occupy the 3rd and 4th bits.
bits<4> src0_modifiers;
bits<4> src1_modifiers;
Comment on lines +531 to +532

Contributor:
Needs a comment explaining these. If you're writing custom operand handling, maybe we shouldn't rename these? It is a weird case.

Contributor Author:
I thought so too, but this matches the existing src_modifiers field and reuses the existing 4-bit mask value in TableGen (opsel_hi, opsel, abs, and neg), and there is already logic in place for asm printing with this bit mask. In the previous revision this was a 2-bit field that read only the lower two bits, so it did not generate the right encoding for the opsel bits. Although this MFMA instruction doesn't use neg and abs, I thought it would be cleaner to match the existing implementation.

Contributor:
I think it does support neg, it's just not wired up.

Contributor Author:
Oh, I see it. The doc says "NEG[1:0] and ABS[1:0] must be zero. NEG[2] and ABS[2] may be used to control matrix C", so I will wire up a src2_modifiers operand with the neg/abs bits mapped and add some tests for it as well.
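
To make the encoding mapping in this thread concrete, here is a small illustrative sketch, not part of the patch, that places the op_sel and op_sel_hi bits of the two assumed 4-bit scale src_modifiers values into the low 64 bits of the VOP3PX encoding, matching the Inst{11}/Inst{12} and Inst{59}/Inst{60} assignments below.

#include <cstdint>
#include <cstdio>

// Place the op_sel / op_sel_hi bits of the two 4-bit scale src_modifiers
// values into the low 64 bits of the VOP3PX encoding.
uint64_t encodeScaleOpSel(uint64_t Low, unsigned Src0Mods, unsigned Src1Mods) {
  Low |= uint64_t((Src0Mods >> 2) & 1) << 11; // src0_modifiers{2} -> Inst{11}
  Low |= uint64_t((Src1Mods >> 2) & 1) << 12; // src1_modifiers{2} -> Inst{12}
  Low |= uint64_t((Src0Mods >> 3) & 1) << 59; // src0_modifiers{3} -> Inst{59}
  Low |= uint64_t((Src1Mods >> 3) & 1) << 60; // src1_modifiers{3} -> Inst{60}
  return Low;
}

int main() {
  // src0_modifiers = 0x4 (op_sel set), src1_modifiers = 0x8 (op_sel_hi set).
  std::printf("low half of encoding = 0x%016llx\n",
              (unsigned long long)encodeScaleOpSel(0, 0x4, 0x8));
  return 0;
}
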


// Inst{7-0} = unused
// Inst{10-8} = neg_hi;
// Inst{13-11} = op_sel
let Inst{11} = scale_src0_opsel{0};
let Inst{12} = scale_src1_opsel{0};
let Inst{11} = src0_modifiers{2}; // op_sel[0]
let Inst{12} = src1_modifiers{2}; // op_sel[1]
// Inst{13} = unused op_sel
// Inst{14} = unused op_sel_hi2

@@ -542,8 +544,8 @@ class VOP3PXe <bits<7> op, VOPProfile MFMAPfl, bit acc_cd = 0> : Enc128, VOP3Pe_
let Inst{49-41} = scale_src1;
// Inst{50-58} = unused
// Inst{60-59} = op_sel_hi;
let Inst{59} = scale_src0_opsel{1};
let Inst{60} = scale_src1_opsel{1};
let Inst{59} = src0_modifiers{3}; // op_sel_hi[0]
let Inst{60} = src1_modifiers{3}; // op_sel_hi[1]
// Inst{63-61} = neg;

// The high half of the encoding is the unscaled mfma op.
@@ -1437,17 +1439,17 @@ class getVOP3MAIScaledPat<VOPProfile P, SDPatternOperator node> {
// mfma
[(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2,
timm:$cbsz, timm:$blgp,
MFMALdScaleModifierOp:$scale_src0_opsel,
MFMALdScaleModifierOp:$src0_modifiers,
i32:$scale_src0,
MFMALdScaleModifierOp:$scale_src1_opsel,
MFMALdScaleModifierOp:$src1_modifiers,
i32:$scale_src1
))],
// smfmac
[(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i32:$idx,
timm:$cbsz, timm:$abid,
MFMALdScaleModifierOp:$scale_src0_opsel,
MFMALdScaleModifierOp:$src0_modifiers,
i32:$scale_src0,
MFMALdScaleModifierOp:$scale_src1_opsel,
MFMALdScaleModifierOp:$src1_modifiers,
i32:$scale_src1))]);
}

@@ -46,7 +46,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -70,7 +70,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -94,7 +94,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -118,7 +118,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -142,7 +142,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -166,7 +166,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -190,7 +190,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1797,7 +1797,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0]
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[1,1,0]
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1819,7 +1819,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[0,0,0]
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1837,7 +1837,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0]
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1860,7 +1860,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: v_mov_b32_e32 v16, 0x4d
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0]
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1879,7 +1879,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
@@ -1921,7 +1921,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
; SDAG-NEXT: v_mov_b32_e32 v17, s13
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel_hi:[0,0,0] blgp:2
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15]
@@ -1946,7 +1946,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
; GISEL-NEXT: v_accvgpr_write_b32 a3, s27
; GISEL-NEXT: v_mov_b32_e32 v16, s29
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel_hi:[0,0,0] blgp:2
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
@@ -1987,7 +1987,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
; SDAG-NEXT: s_nop 1
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel_hi:[0,0,0]
; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5]
@@ -2013,7 +2013,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel_hi:[0,0,0]
; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0]
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2