Skip to content

[mlir][AMDGPU] Add gfx950 MFMAs to the amdgpu.mfma op #133553

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 1, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -650,10 +650,12 @@ def AMDGPU_MFMAPermBAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_MFMAPermB,
// mfma
def MFMAInTypes : AnyTypeOf<[F32, F64, I32, I64,
VectorOfLengthAndType<[2], [F32]>,
VectorOfLengthAndType<[4], [F16]>,
VectorOfLengthAndType<[2, 4], [BF16]>,
VectorOfLengthAndType<[4, 8], [I8]>,
VectorOfLengthAndType<[8], [F8E5M2FNUZ, F8E4M3FNUZ, F8E5M2, F8E4M3FN]>]>;
VectorOfLengthAndType<[4, 8], [F16]>,
VectorOfLengthAndType<[2, 4, 8], [BF16]>,
VectorOfLengthAndType<[4, 8, 16], [I8]>,
VectorOfLengthAndType<[8], [F8E5M2FNUZ, F8E4M3FNUZ]>,
VectorOfLengthAndType<[8, 32], [F8E5M2, F8E4M3FN]>,
VectorOfLengthAndType<[32], [F6E2M3FN, F6E3M2FN, F4E2M1FN]>]>;
def MFMAOutTypes : AnyTypeOf<[F64,
VectorOfLengthAndType<[4, 16, 32], [F32]>,
VectorOfLengthAndType<[4, 16, 32], [I32]>,
Expand Down
163 changes: 135 additions & 28 deletions mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "../LLVMCommon/MemRefDescriptor.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/TypeSwitch.h"
#include <optional>

namespace mlir {
Expand All @@ -36,6 +37,7 @@ using namespace mlir::amdgpu;
constexpr Chipset kGfx908 = Chipset(9, 0, 8);
constexpr Chipset kGfx90a = Chipset(9, 0, 0xa);
constexpr Chipset kGfx942 = Chipset(9, 4, 2);
constexpr Chipset kGfx950 = Chipset(9, 5, 0);

/// Convert an unsigned number `val` to i32.
static Value convertUnsignedToI32(ConversionPatternRewriter &rewriter,
Expand Down Expand Up @@ -494,18 +496,33 @@ struct SchedBarrierOpLowering : public ConvertOpToLLVMPattern<SchedBarrierOp> {
/// and LLVM AMDGPU intrinsics convention.
///
/// Specifically:
/// 1. If `input` is a vector of N bytes, bitcast it to a (N * 8)-bit integer.
/// 2. If the element type is bfloat16, bitcast it to i16.
/// 1. If the element type is bfloat16, bitcast it to i16.
/// 2. If instead we have a more than 64-bit quantity, use a <N / 4 x i32>
/// instead, which is what the f8f6f4 intrinsics use.
/// 3. If `input` is a vector of N <= 8 bytes, bitcast it to a (N * 8)-bit
/// integer.
///
/// Note that the type of `input` has already been LLVM type converted:
/// therefore 8-bit and smaller floats are represented as their corresponding
/// `iN` integers.
static Value convertMFMAVectorOperand(ConversionPatternRewriter &rewriter,
Location loc, Value input) {
Type inputType = input.getType();
if (auto vectorType = dyn_cast<VectorType>(inputType)) {
if (vectorType.getElementType().isBF16())
return rewriter.create<LLVM::BitcastOp>(
loc, vectorType.clone(rewriter.getI16Type()), input);
if (vectorType.getElementType().isInteger(8)) {
if (vectorType.getElementType().isInteger(8) &&
vectorType.getNumElements() <= 8)
return rewriter.create<LLVM::BitcastOp>(
loc, rewriter.getIntegerType(vectorType.getNumElements() * 8), input);
if (isa<IntegerType>(vectorType.getElementType()) &&
vectorType.getElementTypeBitWidth() <= 8) {
int64_t numWords = llvm::divideCeil(
vectorType.getNumElements() * vectorType.getElementTypeBitWidth(),
32);
return rewriter.create<LLVM::BitcastOp>(
loc, VectorType::get(numWords, rewriter.getI32Type()), input);
}
}
return input;
Expand Down Expand Up @@ -622,12 +639,8 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
Chipset chipset) {
uint32_t m = mfma.getM(), n = mfma.getN(), k = mfma.getK(),
b = mfma.getBlocks();
Type sourceElem = mfma.getSourceA().getType();
if (auto sourceType = dyn_cast<VectorType>(sourceElem))
sourceElem = sourceType.getElementType();
Type destElem = mfma.getDestC().getType();
if (auto destType = dyn_cast<VectorType>(destElem))
destElem = destType.getElementType();
Type sourceElem = getElementTypeOrSelf(mfma.getSourceA().getType());
Type destElem = getElementTypeOrSelf(mfma.getDestC().getType());

if (sourceElem.isF32() && destElem.isF32()) {
if (mfma.getReducePrecision() && chipset >= kGfx942) {
Expand All @@ -649,6 +662,12 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
}

if (sourceElem.isF16() && destElem.isF32()) {
if (chipset >= kGfx950) {
if (m == 32 && n == 32 && k == 16 && b == 1)
return ROCDL::mfma_f32_32x32x16_f16::getOperationName();
if (m == 16 && n == 16 && k == 32 && b == 1)
return ROCDL::mfma_f32_16x16x32_f16::getOperationName();
}
if (m == 32 && n == 32 && k == 4 && b == 2)
return ROCDL::mfma_f32_32x32x4f16::getOperationName();
if (m == 16 && n == 16 && k == 4 && b == 4)
Expand All @@ -661,20 +680,25 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
return ROCDL::mfma_f32_16x16x16f16::getOperationName();
}

if (sourceElem.isBF16() && destElem.isF32() && chipset >= kGfx90a) {
if (m == 32 && n == 32 && k == 4 && b == 2)
return ROCDL::mfma_f32_32x32x4bf16_1k::getOperationName();
if (m == 16 && n == 16 && k == 4 && b == 4)
return ROCDL::mfma_f32_16x16x4bf16_1k::getOperationName();
if (m == 4 && n == 4 && k == 4 && b == 16)
return ROCDL::mfma_f32_4x4x4bf16_1k::getOperationName();
if (m == 32 && n == 32 && k == 8 && b == 1)
return ROCDL::mfma_f32_32x32x8bf16_1k::getOperationName();
if (m == 16 && n == 16 && k == 16 && b == 1)
return ROCDL::mfma_f32_16x16x16bf16_1k::getOperationName();
}

if (sourceElem.isBF16() && destElem.isF32()) {
if (chipset >= kGfx950) {
if (m == 32 && n == 32 && k == 16 && b == 1)
return ROCDL::mfma_f32_32x32x16_bf16::getOperationName();
if (m == 16 && n == 16 && k == 32 && b == 1)
return ROCDL::mfma_f32_16x16x32_bf16::getOperationName();
}
if (chipset >= kGfx90a) {
if (m == 32 && n == 32 && k == 4 && b == 2)
return ROCDL::mfma_f32_32x32x4bf16_1k::getOperationName();
if (m == 16 && n == 16 && k == 4 && b == 4)
return ROCDL::mfma_f32_16x16x4bf16_1k::getOperationName();
if (m == 4 && n == 4 && k == 4 && b == 16)
return ROCDL::mfma_f32_4x4x4bf16_1k::getOperationName();
if (m == 32 && n == 32 && k == 8 && b == 1)
return ROCDL::mfma_f32_32x32x8bf16_1k::getOperationName();
if (m == 16 && n == 16 && k == 16 && b == 1)
return ROCDL::mfma_f32_16x16x16bf16_1k::getOperationName();
}
if (m == 32 && n == 32 && k == 2 && b == 2)
return ROCDL::mfma_f32_32x32x2bf16::getOperationName();
if (m == 16 && n == 16 && k == 2 && b == 4)
Expand All @@ -687,7 +711,13 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
return ROCDL::mfma_f32_16x16x8bf16::getOperationName();
}

if (isa<IntegerType>(sourceElem) && destElem.isInteger(32)) {
if (sourceElem.isInteger(8) && destElem.isInteger(32)) {
if (chipset >= kGfx950) {
if (m == 32 && n == 32 && k == 32 && b == 1)
return ROCDL::mfma_i32_32x32x32_i8::getOperationName();
if (m == 16 && n == 16 && k == 64 && b == 1)
return ROCDL::mfma_i32_16x16x64_i8::getOperationName();
}
if (m == 32 && n == 32 && k == 4 && b == 2)
return ROCDL::mfma_i32_32x32x4i8::getOperationName();
if (m == 16 && n == 16 && k == 4 && b == 4)
Expand Down Expand Up @@ -750,6 +780,59 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
return std::nullopt;
}

/// Map a small-float MLIR element type to the numeric "type select" code the
/// gfx950 f8f6f4 scaled-MFMA intrinsics expect (cbsz/blgp field encoding).
/// Returns std::nullopt for any type that has no such encoding.
static std::optional<uint32_t> mfmaTypeSelectCode(Type mlirElemType) {
  if (isa<Float8E4M3FNType>(mlirElemType))
    return 0u;
  if (isa<Float8E5M2Type>(mlirElemType))
    return 1u;
  if (isa<Float6E2M3FNType>(mlirElemType))
    return 2u;
  if (isa<Float6E3M2FNType>(mlirElemType))
    return 3u;
  if (isa<Float4E2M1FNType>(mlirElemType))
    return 4u;
  return std::nullopt;
}

/// If there is a scaled MFMA instruction for the input element types `aType`
/// and `bType`, output type `destType`, problem size M, N, K, and B (number of
/// blocks) on the given `chipset`, return a tuple consisting of the
/// OperationName of the intrinsic and the type codes that need to be passed to
/// that intrinsic. Note that this is also used to implement some un-scaled
/// MFMAs, since the compiler represents the ordinary instruction as a "scaled"
/// MFMA with a scale of 0.
static std::optional<std::tuple<StringRef, uint32_t, uint32_t>>
mfmaOpToScaledIntrinsic(Type aType, Type bType, Type destType, uint32_t m,
                        uint32_t n, uint32_t k, uint32_t b, Chipset chipset) {
  // Scaled MFMAs only exist on gfx950 and later, and only accumulate to f32.
  if (chipset < kGfx950)
    return std::nullopt;
  if (!isa<Float32Type>(getElementTypeOrSelf(destType)))
    return std::nullopt;

  // Both source element types must have an f8f6f4 "type select" encoding.
  std::optional<uint32_t> aTypeCode =
      mfmaTypeSelectCode(getElementTypeOrSelf(aType));
  std::optional<uint32_t> bTypeCode =
      mfmaTypeSelectCode(getElementTypeOrSelf(bType));
  if (!aTypeCode.has_value() || !bTypeCode.has_value())
    return std::nullopt;

  // Only the two single-block tile shapes below have scaled intrinsics.
  if (m == 32 && n == 32 && k == 64 && b == 1)
    return std::tuple{ROCDL::mfma_scale_f32_32x32x64_f8f6f4::getOperationName(),
                      *aTypeCode, *bTypeCode};
  if (m == 16 && n == 16 && k == 128 && b == 1)
    return std::tuple{
        ROCDL::mfma_scale_f32_16x16x128_f8f6f4::getOperationName(), *aTypeCode,
        *bTypeCode};

  return std::nullopt;
}

/// Overload of mfmaOpToScaledIntrinsic that unpacks the operand types and
/// problem-size attributes directly from an `amdgpu.mfma` operation.
static std::optional<std::tuple<StringRef, uint32_t, uint32_t>>
mfmaOpToScaledIntrinsic(MFMAOp mfma, Chipset chipset) {
  Type aType = mfma.getSourceA().getType();
  Type bType = mfma.getSourceB().getType();
  Type cType = mfma.getDestC().getType();
  return mfmaOpToScaledIntrinsic(aType, bType, cType, mfma.getM(), mfma.getN(),
                                 mfma.getK(), mfma.getBlocks(), chipset);
}

/// Return the `rocdl` intrinsic corresponding to a WMMA operation `wmma`
/// if one exists. This includes checking to ensure the intrinsic is supported
/// on the architecture you are compiling for.
Expand Down Expand Up @@ -829,16 +912,40 @@ struct MFMAOpLowering : public ConvertOpToLLVMPattern<MFMAOp> {
op.getNegateA() | (op.getNegateB() << 1) | (op.getNegateC() << 2);
}
std::optional<StringRef> maybeIntrinsic = mfmaOpToIntrinsic(op, chipset);
if (!maybeIntrinsic.has_value())
std::optional<std::tuple<StringRef, uint32_t, uint32_t>>
maybeScaledIntrinsic = mfmaOpToScaledIntrinsic(op, chipset);
if (!maybeIntrinsic.has_value() && !maybeScaledIntrinsic.has_value())
return op.emitOpError("no intrinsic matching MFMA size on given chipset");
OperationState loweredOp(loc, *maybeIntrinsic);

bool isScaled =
!maybeIntrinsic.has_value() && maybeScaledIntrinsic.has_value();
if (isScaled &&
(adaptor.getAbid() > 0 || getBlgpField > 0 || op.getCbsz() > 0)) {
return op.emitOpError(
"non-default abid, blgp, and cbsz aren't supported on MFMAs that can "
"be scaled as those fields are used for type information");
}

StringRef intrinsicName =
isScaled ? std::get<0>(*maybeScaledIntrinsic) : *maybeIntrinsic;
OperationState loweredOp(loc, intrinsicName);
loweredOp.addTypes(intrinsicOutType);
loweredOp.addOperands(
{convertMFMAVectorOperand(rewriter, loc, adaptor.getSourceA()),
convertMFMAVectorOperand(rewriter, loc, adaptor.getSourceB()),
adaptor.getDestC(), createI32Constant(rewriter, loc, op.getCbsz()),
createI32Constant(rewriter, loc, op.getAbid()),
createI32Constant(rewriter, loc, getBlgpField)});
adaptor.getDestC()});
if (isScaled) {
Value zero = createI32Constant(rewriter, loc, 0);
auto [_scaledName, aTypeCode, bTypeCode] = *maybeScaledIntrinsic;
loweredOp.addOperands({createI32Constant(rewriter, loc, aTypeCode),
createI32Constant(rewriter, loc, bTypeCode),
/*scale A byte=*/zero, /*scale A=*/zero,
/*scale B byte=*/zero, /*scale B=*/zero});
} else {
loweredOp.addOperands({createI32Constant(rewriter, loc, op.getCbsz()),
createI32Constant(rewriter, loc, op.getAbid()),
createI32Constant(rewriter, loc, getBlgpField)});
};
Value lowered = rewriter.create(loweredOp)->getResult(0);
if (outType != intrinsicOutType)
lowered = rewriter.create<LLVM::BitcastOp>(loc, outType, lowered);
Expand Down
14 changes: 8 additions & 6 deletions mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -341,22 +341,24 @@ LogicalResult MFMAOp::verify() {
}

Type sourceBType = getSourceB().getType();
if (sourceElem.isFloat(8)) {
if (sourceElem.isFloat(8) || sourceElem.isFloat(6) || sourceElem.isFloat(4)) {
int64_t sourceBLen = 1;
Type sourceBElem = sourceBType;
if (auto sourceBVector = llvm::dyn_cast<VectorType>(sourceBType)) {
sourceBLen = sourceBVector.getNumElements();
sourceBElem = sourceBVector.getElementType();
}
if (!sourceBElem.isFloat(8))
return emitOpError("expected both source operands to have f8 elements");
if (!sourceBElem.isFloat(8) && !sourceBElem.isFloat(6) &&
!sourceBElem.isFloat(4))
return emitOpError("expected both source operands to have small-float "
"elements if one does");
if (sourceLen != sourceBLen)
return emitOpError(
"expected both f8 source vectors to have the same length");
"expected both small-float source vectors to have the same length");
} else {
if (sourceType != sourceBType)
return emitOpError(
"expected both non-f8 source operand types to match exactly");
return emitOpError("expected both non-small-float source operand types "
"to match exactly");
}
// Normalize the wider integer types the compiler expects to i8
if (sourceElem.isInteger(32)) {
Expand Down
Loading
Loading