Skip to content

[AMDGPU][MLIR] Replace gfx940 and gfx941 with gfx942 in MLIR #125836

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,7 @@ def AMDGPU_MFMAOp :
order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on).

The negateA, negateB, and negateC flags are only supported for double-precision
operations on gfx940+.
operations on gfx94x.
}];
let assemblyFormat = [{
$sourceA `*` $sourceB `+` $destC
Expand Down
8 changes: 4 additions & 4 deletions mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -348,11 +348,11 @@ def ROCDL_mfma_f32_16x16x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x4bf16.1k">
def ROCDL_mfma_f32_4x4x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.4x4x4bf16.1k">;
def ROCDL_mfma_f32_32x32x8bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x8bf16.1k">;
def ROCDL_mfma_f32_16x16x16bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x16bf16.1k">;
// Note: in gfx940, unlike in gfx90a, the f64 xdlops use the "blgp" argument as a
// NEG bitfield. See IntrinsicsAMDGPU.td for more info.
// Note: in gfx94x, unlike in gfx90a, the f64 xdlops use the "blgp" argument as
// a NEG bitfield. See IntrinsicsAMDGPU.td for more info.
def ROCDL_mfma_f64_16x16x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.16x16x4f64">;
def ROCDL_mfma_f64_4x4x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.4x4x4f64">;
// New in gfx940.
// New in gfx94x.
def ROCDL_mfma_i32_16x16x32_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.16x16x32.i8">;
def ROCDL_mfma_i32_32x32x16_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.32x32x16.i8">;
def ROCDL_mfma_f32_16x16x8_xf32 : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x8.xf32">;
Expand All @@ -375,7 +375,7 @@ def ROCDL_mfma_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.f16">;
def ROCDL_mfma_scale_f32_16x16x128_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.16x16x128.f8f6f4", [0,1]>;
def ROCDL_mfma_scale_f32_32x32x64_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.32x32x64.f8f6f4", [0,1]>;

// 2:4 Sparsity ops (GFX940)
// 2:4 Sparsity ops (GFX94x)
def ROCDL_smfmac_f32_16x16x32_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.f16">;
def ROCDL_smfmac_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x16.f16">;
def ROCDL_smfmac_f32_16x16x32_bf16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.bf16">;
Expand Down
22 changes: 11 additions & 11 deletions mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ namespace {
// Define commonly used chipsets versions for convenience.
constexpr Chipset kGfx908 = Chipset(9, 0, 8);
constexpr Chipset kGfx90a = Chipset(9, 0, 0xa);
constexpr Chipset kGfx940 = Chipset(9, 4, 0);
constexpr Chipset kGfx942 = Chipset(9, 4, 2);

/// Define lowering patterns for raw buffer ops
template <typename GpuOp, typename Intrinsic>
Expand Down Expand Up @@ -483,7 +483,7 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
destElem = destType.getElementType();

if (sourceElem.isF32() && destElem.isF32()) {
if (mfma.getReducePrecision() && chipset >= kGfx940) {
if (mfma.getReducePrecision() && chipset >= kGfx942) {
if (m == 32 && n == 32 && k == 4 && b == 1)
return ROCDL::mfma_f32_32x32x4_xf32::getOperationName();
if (m == 16 && n == 16 && k == 8 && b == 1)
Expand Down Expand Up @@ -551,9 +551,9 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
return ROCDL::mfma_i32_32x32x8i8::getOperationName();
if (m == 16 && n == 16 && k == 16 && b == 1)
return ROCDL::mfma_i32_16x16x16i8::getOperationName();
if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx940)
if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx942)
return ROCDL::mfma_i32_32x32x16_i8::getOperationName();
if (m == 16 && n == 16 && k == 32 && b == 1 && chipset >= kGfx940)
if (m == 16 && n == 16 && k == 32 && b == 1 && chipset >= kGfx942)
return ROCDL::mfma_i32_16x16x32_i8::getOperationName();
}

Expand All @@ -565,7 +565,7 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
}

if (isa<Float8E5M2FNUZType>(sourceElem) && destElem.isF32() &&
chipset >= kGfx940) {
chipset >= kGfx942) {
// Known to be correct because there are no scalar f8 instructions and
// because a length mismatch will have been caught by the verifier.
Type sourceBElem =
Expand All @@ -585,7 +585,7 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
}

if (isa<Float8E4M3FNUZType>(sourceElem) && destElem.isF32() &&
chipset >= kGfx940) {
chipset >= kGfx942) {
Type sourceBElem =
cast<VectorType>(mfma.getSourceB().getType()).getElementType();
if (m == 16 && n == 16 && k == 32 && b == 1) {
Expand Down Expand Up @@ -653,8 +653,8 @@ struct MFMAOpLowering : public ConvertOpToLLVMPattern<MFMAOp> {
return op->emitOpError("MFMA only supported on gfx908+");
uint32_t getBlgpField = static_cast<uint32_t>(op.getBlgp());
if (op.getNegateA() || op.getNegateB() || op.getNegateC()) {
if (chipset < kGfx940)
return op.emitOpError("negation unsupported on older than gfx940");
if (chipset < kGfx942)
return op.emitOpError("negation unsupported on older than gfx942");
getBlgpField |=
op.getNegateA() | (op.getNegateB() << 1) | (op.getNegateC() << 2);
}
Expand Down Expand Up @@ -775,7 +775,7 @@ LogicalResult ExtPackedFp8OpLowering::matchAndRewrite(
ExtPackedFp8Op op, ExtPackedFp8OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
Location loc = op.getLoc();
if (chipset.majorVersion != 9 || chipset < kGfx940)
if (chipset.majorVersion != 9 || chipset < kGfx942)
return rewriter.notifyMatchFailure(
loc, "Fp8 conversion instructions are not available on target "
"architecture and their emulation is not implemented");
Expand Down Expand Up @@ -819,7 +819,7 @@ LogicalResult PackedTrunc2xFp8OpLowering::matchAndRewrite(
PackedTrunc2xFp8Op op, PackedTrunc2xFp8OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
Location loc = op.getLoc();
if (chipset.majorVersion != 9 || chipset < kGfx940)
if (chipset.majorVersion != 9 || chipset < kGfx942)
return rewriter.notifyMatchFailure(
loc, "Fp8 conversion instructions are not available on target "
"architecture and their emulation is not implemented");
Expand Down Expand Up @@ -856,7 +856,7 @@ LogicalResult PackedStochRoundFp8OpLowering::matchAndRewrite(
PackedStochRoundFp8Op op, PackedStochRoundFp8OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
Location loc = op.getLoc();
if (chipset.majorVersion != 9 || chipset < kGfx940)
if (chipset.majorVersion != 9 || chipset < kGfx942)
return rewriter.notifyMatchFailure(
loc, "Fp8 conversion instructions are not available on target "
"architecture and their emulation is not implemented");
Expand Down
2 changes: 1 addition & 1 deletion mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ void ArithToAMDGPUConversionPass::runOnOperation() {
}

bool convertFP8Arithmetic =
maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 0);
maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 2);
arith::populateArithToAMDGPUConversionPatterns(
patterns, convertFP8Arithmetic, saturateFP8Truncf, allowPackedF16Rtz,
*maybeChipset);
Expand Down
8 changes: 1 addition & 7 deletions mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
}
// gfx9 has no to a very limited support for floating-point min and max.
if (chipset.majorVersion == 9) {
if (chipset >= Chipset(9, 0, 0xa) && chipset != Chipset(9, 4, 1)) {
if (chipset >= Chipset(9, 0, 0xa)) {
// gfx90a supports f64 max (and min, but we don't have a min wrapper right
// now) but all other types need to be emulated.
target.addDynamicallyLegalOp<RawBufferAtomicFmaxOp>(
Expand All @@ -189,12 +189,6 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
} else {
target.addIllegalOp<RawBufferAtomicFmaxOp>();
}
if (chipset == Chipset(9, 4, 1)) {
// gfx941 requires non-CAS atomics to be implemented with CAS loops.
// The workaround here mirrors HIP and OpenMP.
target.addIllegalOp<RawBufferAtomicFaddOp, RawBufferAtomicFmaxOp,
RawBufferAtomicSmaxOp, RawBufferAtomicUminOp>();
}
}
patterns.add<
RawBufferAtomicByCasPattern<RawBufferAtomicFaddOp, arith::AddFOp>,
Expand Down
2 changes: 1 addition & 1 deletion mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx940 | FileCheck %s
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s

// CHECK-LABEL: func @ext_scalar
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %{{.+}} : f8E5M2FNUZ to i8
Expand Down
2 changes: 1 addition & 1 deletion mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx940 -cse | FileCheck %s
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 -cse | FileCheck %s
func.func @mfma_to_rocdl(%arg0 : f32, %arg1 : vector<32xf32>,
%arg2 : vector<16xf32>, %arg3 : vector<4xf32>,
%arg4 : vector<4xf16>, %arg5 : vector<4xi8>,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// RUN: mlir-opt --split-input-file %s \
// RUN: --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx940 saturate-fp8-truncf=true}))' \
// RUN: --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx942 saturate-fp8-truncf=true}))' \
// RUN: | FileCheck %s

// CHECK-LABEL: func.func @scalar_trunc
Expand Down
2 changes: 1 addition & 1 deletion mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx940" | FileCheck %s
// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx942" | FileCheck %s

// CHECK-LABEL: func.func @scalar_ext
// CHECK-SAME: ([[V:%.+]]: f8E5M2FNUZ)
Expand Down
20 changes: 8 additions & 12 deletions mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ TEST(ChipsetTest, Parsing) {
EXPECT_EQ(chipset->minorVersion, 0u);
EXPECT_EQ(chipset->steppingVersion, 0xau);

chipset = Chipset::parse("gfx940");
chipset = Chipset::parse("gfx942");
ASSERT_TRUE(succeeded(chipset));
EXPECT_EQ(chipset->majorVersion, 9u);
EXPECT_EQ(chipset->minorVersion, 4u);
EXPECT_EQ(chipset->steppingVersion, 0u);
EXPECT_EQ(chipset->steppingVersion, 2u);

chipset = Chipset::parse("gfx1103");
ASSERT_TRUE(succeeded(chipset));
Expand All @@ -36,30 +36,26 @@ TEST(ChipsetTest, ParsingInvalid) {
EXPECT_TRUE(failed(Chipset::parse("navi33")));
EXPECT_TRUE(failed(Chipset::parse("rdna2")));
EXPECT_TRUE(failed(Chipset::parse("sm_80")));
EXPECT_TRUE(failed(Chipset::parse("GFX940")));
EXPECT_TRUE(failed(Chipset::parse("Gfx940")));
EXPECT_TRUE(failed(Chipset::parse("GFX942")));
EXPECT_TRUE(failed(Chipset::parse("Gfx942")));
EXPECT_TRUE(failed(Chipset::parse("gfx9")));
EXPECT_TRUE(failed(Chipset::parse("gfx_940")));
EXPECT_TRUE(failed(Chipset::parse("gfx940_")));
EXPECT_TRUE(failed(Chipset::parse("gfx_942")));
EXPECT_TRUE(failed(Chipset::parse("gfx942_")));
EXPECT_TRUE(failed(Chipset::parse("gfxmeow")));
EXPECT_TRUE(failed(Chipset::parse("gfx1fff")));
}

TEST(ChipsetTest, Comparison) {
EXPECT_EQ(Chipset(9, 4, 0), Chipset(9, 4, 0));
EXPECT_NE(Chipset(9, 4, 0), Chipset(9, 4, 2));
EXPECT_EQ(Chipset(9, 4, 2), Chipset(9, 4, 2));
EXPECT_NE(Chipset(9, 0, 0), Chipset(10, 0, 0));

EXPECT_LT(Chipset(9, 0, 0), Chipset(10, 0, 0));
EXPECT_LT(Chipset(9, 0, 0), Chipset(9, 4, 2));
EXPECT_LE(Chipset(9, 4, 1), Chipset(9, 4, 1));
EXPECT_FALSE(Chipset(9, 4, 2) < Chipset(9, 4, 2));
EXPECT_FALSE(Chipset(9, 4, 2) < Chipset(9, 4, 0));

EXPECT_GT(Chipset(9, 0, 0xa), Chipset(9, 0, 8));
EXPECT_GE(Chipset(9, 0, 0xa), Chipset(9, 0, 0xa));
EXPECT_FALSE(Chipset(9, 4, 1) >= Chipset(9, 4, 2));
EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 0));
EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 2));
}

} // namespace
Expand Down