Revert "[MLIR][AMDGPU] Introduce fp16 packed arithmetic (llvm#105688)"

MaheshRavishankar · MaheshRavishankar · commit f6935c777f67 · 2024-08-29T01:04:44.000-07:00
This reverts commit 1387ba4.
diff --git a/mlir/include/mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h b/mlir/include/mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h
@@ -9,9 +9,7 @@
 #ifndef MLIR_CONVERSION_ARITHTOAMDGPU_ARITHTOAMDGPU_H
 #define MLIR_CONVERSION_ARITHTOAMDGPU_ARITHTOAMDGPU_H
 
-#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
 #include <memory>
-#include <string>
 
 namespace mlir {
 
@@ -28,10 +26,7 @@ namespace arith {
 /// to the largest value of that type instead of being rewritten to Inf (aka
 /// NaN).
 void populateArithToAMDGPUConversionPatterns(RewritePatternSet &patterns,
-                                             bool convertFP8Arithmetic,
-                                             bool saturateFP8Truncf,
-                                             bool allowPackedF16Rtz,
-                                             amdgpu::Chipset chipset);
+                                             bool saturateFP8TruncF);
 } // namespace arith
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
@@ -153,15 +153,9 @@ def ArithToAMDGPUConversionPass : Pass<"convert-arith-to-amdgpu"> {
   let dependentDialects = ["amdgpu::AMDGPUDialect", "vector::VectorDialect"];
 
   let options = [
-    Option<"chipset", "chipset", "std::string",
-                        /*default=*/"\"gfx000\"",
-                        "Chipset that these operations will run on">,
     Option<"saturateFP8Truncf", "saturate-fp8-truncf", "bool",
            /*default=*/"false",
            "Use saturating truncation for 8-bit float types">,
-    Option<"allowPackedF16Rtz", "allow-packed-f16-round-to-zero", "bool",
-           /*default=*/"false",
-           "Whether we should allow f32->f16 packed round-to-zero conversion">,
   ];
 }
 
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -25,7 +25,6 @@ def AMDGPU_Dialect : Dialect {
 
 
   let dependentDialects = [
-    "ROCDL::ROCDLDialect",
     "arith::ArithDialect",
     "gpu::GPUDialect"
   ];
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -166,7 +166,7 @@ def ROCDL_BallotOp :
   let summary = "Vote across thread group";
 
   let description = [{
-      Ballot provides a bit mask containing the 1-bit predicate value from each lane.
+      Ballot provides a bit mask containing the 1-bit predicate value from each lane. 
       The nth bit of the result contains the 1 bit contributed by the nth warp lane.
   }];
 
@@ -579,21 +579,6 @@ def ROCDL_DPPUpdateOp : ROCDL_IntrOp<"update.dpp", [], [0],
   }];
 }
 
-//===---------------------------------------------------------------------===//
-// 16-bit float intrinsics
-//===---------------------------------------------------------------------===//
-def ROCDL_CvtPkRtz:
-    ROCDL_IntrOp<"cvt.pkrtz", [], [], [Pure], 1>,
-    Arguments<(ins F32:$srcA, F32:$srcB)> {
-  let summary = "Convert two f32 input into a vector<2xf16>";
-  let description = [{
-    Convert two f32 values into a packed vector<2xf16>.
-  }];
-  let assemblyFormat = [{
-    attr-dict $srcA `,` $srcB `:` type($res)
-  }];
-}
-
 //===---------------------------------------------------------------------===//
 // 8-bit float intrinsics
 //===---------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -9,11 +9,8 @@
 #include "mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h"
 
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
-#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/PatternMatch.h"
@@ -27,7 +24,6 @@ namespace mlir {
 } // namespace mlir
 
 using namespace mlir;
-using namespace mlir::amdgpu;
 
 namespace {
 struct ArithToAMDGPUConversionPass final
@@ -47,25 +43,12 @@ struct ExtFOnFloat8RewritePattern final : OpRewritePattern<arith::ExtFOp> {
 
 struct TruncFToFloat8RewritePattern final : OpRewritePattern<arith::TruncFOp> {
   bool saturateFP8 = false;
-  TruncFToFloat8RewritePattern(MLIRContext *ctx, bool saturateFP8,
-                               Chipset chipset)
-      : OpRewritePattern::OpRewritePattern(ctx), saturateFP8(saturateFP8),
-        chipset(chipset) {}
-  Chipset chipset;
+  TruncFToFloat8RewritePattern(MLIRContext *ctx, bool saturateFP8)
+      : OpRewritePattern::OpRewritePattern(ctx), saturateFP8(saturateFP8) {}
 
   LogicalResult match(arith::TruncFOp op) const override;
   void rewrite(arith::TruncFOp op, PatternRewriter &rewriter) const override;
 };
-
-struct TruncfToFloat16RewritePattern final
-    : public OpRewritePattern<arith::TruncFOp> {
-
-  using OpRewritePattern<arith::TruncFOp>::OpRewritePattern;
-
-  LogicalResult match(arith::TruncFOp op) const override;
-  void rewrite(arith::TruncFOp op, PatternRewriter &rewriter) const override;
-};
-
 } // end namespace
 
 static Value castF32To(Type elementType, Value f32, Location loc,
@@ -289,105 +272,17 @@ void TruncFToFloat8RewritePattern::rewrite(arith::TruncFOp op,
   rewriter.replaceOp(op, result);
 }
 
-LogicalResult TruncfToFloat16RewritePattern::match(arith::TruncFOp op) const {
-  Type outType = op.getOut().getType();
-  Type inputType = getElementTypeOrSelf(op.getIn());
-  if (auto outVecType = dyn_cast<VectorType>(outType)) {
-    if (outVecType.isScalable())
-      return failure();
-    outType = outVecType.getElementType();
-  }
-  return success(outType.isF16() && inputType.isF32());
-}
-
-void TruncfToFloat16RewritePattern::rewrite(arith::TruncFOp op,
-                                            PatternRewriter &rewriter) const {
-  Location loc = op.getLoc();
-  Value in = op.getIn();
-  Type outElemType = getElementTypeOrSelf(op.getOut().getType());
-  VectorType truncResType = VectorType::get(2, outElemType);
-  auto inVectorTy = dyn_cast<VectorType>(in.getType());
-
-  // Handle the case where input type is not a vector type
-  if (!inVectorTy) {
-    auto sourceB = rewriter.create<LLVM::PoisonOp>(loc, rewriter.getF32Type());
-    Value asF16s =
-        rewriter.create<ROCDL::CvtPkRtz>(loc, truncResType, in, sourceB);
-    Value result = rewriter.create<vector::ExtractElementOp>(
-        loc, asF16s, rewriter.createOrFold<arith::ConstantIndexOp>(loc, 0));
-    return rewriter.replaceOp(op, result);
-  }
-  VectorType outType = cast<VectorType>(op.getOut().getType());
-  int64_t numElements = outType.getNumElements();
-  Value zero = rewriter.createOrFold<arith::ConstantOp>(
-      loc, outElemType, rewriter.getFloatAttr(outElemType, 0.0));
-  Value result = rewriter.createOrFold<vector::SplatOp>(loc, outType, zero);
-
-  if (inVectorTy.getRank() > 1) {
-    inVectorTy = VectorType::get(SmallVector<int64_t>{numElements},
-                                 inVectorTy.getElementType());
-    in = rewriter.create<vector::ShapeCastOp>(loc, inVectorTy, in);
-  }
-
-  // Handle the vector case. We also handle the (uncommon) case where the vector
-  // length is odd
-  for (int64_t i = 0; i < numElements; i += 2) {
-    int64_t elemsThisOp = std::min(numElements, i + 2) - i;
-    Value thisResult = nullptr;
-    Value elemA = rewriter.create<vector::ExtractElementOp>(
-        loc, in, rewriter.create<arith::ConstantIndexOp>(loc, i));
-    Value elemB = rewriter.create<LLVM::PoisonOp>(loc, rewriter.getF32Type());
-
-    if (elemsThisOp == 2) {
-      elemB = rewriter.create<vector::ExtractElementOp>(
-          loc, in, rewriter.createOrFold<arith::ConstantIndexOp>(loc, i + 1));
-    }
-
-    thisResult =
-        rewriter.create<ROCDL::CvtPkRtz>(loc, truncResType, elemA, elemB);
-    // Place back the truncated result into the possibly larger vector. If we
-    // are operating on a size 2 vector, these operations should be folded away
-    thisResult = rewriter.create<vector::ExtractStridedSliceOp>(
-        loc, thisResult, 0, elemsThisOp, 1);
-    result = rewriter.create<vector::InsertStridedSliceOp>(loc, thisResult,
-                                                           result, i, 1);
-  }
-
-  if (inVectorTy.getRank() != outType.getRank()) {
-    result = rewriter.create<vector::ShapeCastOp>(loc, outType, result);
-  }
-
-  rewriter.replaceOp(op, result);
-}
-
 void mlir::arith::populateArithToAMDGPUConversionPatterns(
-    RewritePatternSet &patterns, bool convertFP8Arithmetic,
-    bool saturateFP8Truncf, bool allowPackedF16Rtz, Chipset chipset) {
-
-  if (convertFP8Arithmetic) {
-    patterns.add<ExtFOnFloat8RewritePattern>(patterns.getContext());
-    patterns.add<TruncFToFloat8RewritePattern>(patterns.getContext(),
-                                               saturateFP8Truncf, chipset);
-  }
-  if (allowPackedF16Rtz)
-    patterns.add<TruncfToFloat16RewritePattern>(patterns.getContext());
+    RewritePatternSet &patterns, bool saturateFP8TruncF) {
+  patterns.add<ExtFOnFloat8RewritePattern>(patterns.getContext());
+  patterns.add<TruncFToFloat8RewritePattern>(patterns.getContext(),
+                                             saturateFP8TruncF);
 }
 
 void ArithToAMDGPUConversionPass::runOnOperation() {
   Operation *op = getOperation();
-  MLIRContext *ctx = &getContext();
   RewritePatternSet patterns(op->getContext());
-  FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
-  if (failed(maybeChipset)) {
-    emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
-    return signalPassFailure();
-  }
-
-  bool convertFP8Arithmetic =
-      (*maybeChipset).majorVersion == 9 && (*maybeChipset).minorVersion >= 0x40;
-  arith::populateArithToAMDGPUConversionPatterns(
-      patterns, convertFP8Arithmetic, saturateFP8Truncf, allowPackedF16Rtz,
-      *maybeChipset);
+  arith::populateArithToAMDGPUConversionPatterns(patterns, saturateFP8Truncf);
   if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns))))
     return signalPassFailure();
 }
diff --git a/mlir/lib/Conversion/ArithToAMDGPU/CMakeLists.txt b/mlir/lib/Conversion/ArithToAMDGPU/CMakeLists.txt
@@ -12,7 +12,6 @@ add_mlir_conversion_library(MLIRArithToAMDGPU
 
   LINK_LIBS PUBLIC
   MLIRAMDGPUDialect
-  MLIRAMDGPUUtils
   MLIRArithDialect
   MLIRArithUtils
   MLIRVectorDialect
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -14,7 +14,6 @@
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Diagnostics.h"
diff --git a/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt
@@ -11,7 +11,6 @@ add_mlir_dialect_library(MLIRAMDGPUDialect
 
   LINK_LIBS PUBLIC
   MLIRArithDialect
-  MLIRROCDLDialect
   # Needed for GPU address space enum definition
   MLIRGPUDialect
   MLIRIR
diff --git a/mlir/test/Conversion/ArithToAMDGPU/16-bit-floats.mlir b/mlir/test/Conversion/ArithToAMDGPU/16-bit-floats.mlir
diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt --split-input-file %s \
-// RUN:   --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx940 saturate-fp8-truncf=true}))' \
+// RUN:   --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{saturate-fp8-truncf=true}))' \
 // RUN:   | FileCheck %s
 
 // CHECK-LABEL: func.func @scalar_trunc
diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx940" | FileCheck %s
+// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu | FileCheck %s
 
 // CHECK-LABEL: func.func @scalar_ext
 // CHECK-SAME: ([[V:%.+]]: f8E5M2FNUZ)
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -530,12 +530,6 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
   llvm.return %source5 : i32
 }
 
-llvm.func @rocdl_16bit_packed_floats(%sourceA: f32, %sourceB: f32) -> vector<2xf16> {
-  // CHECK: call <2 x half> @llvm.amdgcn.cvt.pkrtz(float {{.*}}, float {{.*}})
-  %source = rocdl.cvt.pkrtz %sourceA, %sourceB  : vector<2xf16>
-  llvm.return %source : vector<2xf16>
-}
-
 // CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" }
 // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
 // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1,4 +1,4 @@` |
| `1` | | `-// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx940" \| FileCheck %s` |
| | `1` | `+// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu \| FileCheck %s` |
| `2` | `2` | |
| `3` | `3` | `// CHECK-LABEL: func.func @scalar_ext` |
| `4` | `4` | `// CHECK-SAME: ([[V:%.+]]: f8E5M2FNUZ)` |