llvm
diff --git a/‎mlir/include/mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h
Lines changed: 27 additions & 0 deletions b/‎mlir/include/mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h
Lines changed: 27 additions & 0 deletions
diff --git a/‎mlir/include/mlir/Conversion/Passes.h
Lines changed: 1 addition & 0 deletions b/‎mlir/include/mlir/Conversion/Passes.h
Lines changed: 1 addition & 0 deletions
diff --git a/‎mlir/include/mlir/Conversion/Passes.td
Lines changed: 15 additions & 0 deletions b/‎mlir/include/mlir/Conversion/Passes.td
Lines changed: 15 additions & 0 deletions
diff --git a/‎mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
Lines changed: 79 additions & 0 deletions b/‎mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
Lines changed: 79 additions & 0 deletions
diff --git a/‎mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
Lines changed: 81 additions & 3 deletions b/‎mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
Lines changed: 81 additions & 3 deletions
@@ -0,0 +1,27 @@
+//===- ArithToAMDGPU.h - Arith to AMDGPU dialect conversion ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_CONVERSION_ARITHTOAMDGPU_ARITHTOAMDGPU_H
+#define MLIR_CONVERSION_ARITHTOAMDGPU_ARITHTOAMDGPU_H
+
+#include <memory>
+
+namespace mlir {
+
+class RewritePatternSet;
+class Pass;
+
+#define GEN_PASS_DECL_ARITHTOAMDGPUCONVERSIONPASS
+#include "mlir/Conversion/Passes.h.inc"
+
+namespace arith {
+void populateArithToAMDGPUConversionPatterns(RewritePatternSet &patterns);
+} // namespace arith
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_ARITHTOAMDGPU_ARITHTOAMDGPU_H
@@ -11,6 +11,7 @@
 
 #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
+#include "mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
 #include "mlir/Conversion/ArithToSPIRV/ArithToSPIRV.h"
 #include "mlir/Conversion/ArmNeon2dToIntr/ArmNeon2dToIntr.h"
 
@@ -112,6 +112,21 @@ def ConvertAMDGPUToROCDL : Pass<"convert-amdgpu-to-rocdl"> {
                         "Chipset that these operations will run on">];
 }
 
+//===----------------------------------------------------------------------===//
+// ArithToAMDGPU
+//===----------------------------------------------------------------------===//
+def ArithToAMDGPUConversionPass : Pass<"convert-arith-to-amdgpu"> {
+  let summary = "Convert Arith operations to AMDGPU-specific implementations";
+  let description = [{
+    Convert `arith` operations (currently extf and truncf on 8-bit floats)
+    to operations in the `amdgpu` dialect. This pass is done in two steps
+    in order to avoid running a notional arith-to-rocdl and arith-to-llvm
+    simultaneously.
+  }];
+
+  let dependentDialects = ["amdgpu::AMDGPUDialect", "vector::VectorDialect"];
+}
+
 //===----------------------------------------------------------------------===//
 // ArithToLLVM
 //===----------------------------------------------------------------------===//
 
@@ -38,6 +38,85 @@ def AMDGPU_Dialect : Dialect {
 class AMDGPU_Op<string mnemonic, list<Trait> traits = []> :
   Op<AMDGPU_Dialect, mnemonic, traits> {}
 
+def AMDGPU_ExtPackedFp8Op :
+    AMDGPU_Op<"ext_packed_fp8", [Pure]>,
+    Arguments<(ins AnyTypeOf<[F8E5M2FNUZ, F8E4M3FNUZ,
+        VectorOfLengthAndType<[1, 2, 3, 4], [F8E5M2FNUZ, F8E4M3FNUZ]>]>:$source,
+      ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<3>]>:$index)>,
+    Results<(outs F32:$res)> {
+  let summary = "Extend one of a vector of packed fp8 values to a float";
+  let description = [{
+    Extend the value `source[index]` to a 32-bit float and return it.
+
+    This rather unusual signature arises from the fact that AMD GPUs cannot
+    easily work with sub 32-bit quantities, so the compiler intrinsics for
+    extending 8-bit floats (which are, currently, the only way to work with
+    this operation) take packed vectors of 4 such floats.
+
+    If the passed-in vector has fewer than four elements, or the input is scalar,
+    the remaining values in the <4 x i8> will be filled with
+    undefined values as needed.
+  }];
+  let assemblyFormat = [{
+    attr-dict $source `[` $index `]` `:` type($source) `to` type($res)
+  }];
+}
+
+def AMDGPU_PackedTrunc2xFp8Op :
+    AMDGPU_Op<"packed_trunc_2xfp8", [Pure, AttrSizedOperandSegments]>,
+    Arguments<(ins F32:$sourceA,
+      Optional<F32>:$sourceB,
+      ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<1>]>:$wordIndex,
+      Optional<FixedVectorOfLengthAndType<[4], [F8E4M3FNUZ, F8E5M2FNUZ]>>:$existing)>,
+    Results<(outs FixedVectorOfLengthAndType<[4], [F8E4M3FNUZ, F8E5M2FNUZ]>:$res)> {
+  let summary = "Round two floats into a packed vector of 8-bit floats";
+  let description = [{
+    Round the inputs `sourceA` and `sourceB` (which is undefined if not
+    specified) into the low or high word (bottom two or top two) elements
+    of the returned vector, keeping the other two elements of `existing`
+    unchanged if present (or undefined if it was not passed in).
+
+    The reason for this odd signature is that AMD GPUs cannot easily work with
+    sub-registers, and so the conversion intrinsics (which are currently the
+    only way to work with 8-bit float types) take packed vectors of 4 8-bit
+    values.
+  }];
+  let assemblyFormat = [{
+    attr-dict $sourceA `,` ($sourceB^):(`undef`)?
+    `into` ($existing^):(`undef`)? `[` `word` $wordIndex `]`
+    `:` type($sourceA) `to` type($res) (`into` type($existing)^)?
+  }];
+  let hasVerifier = 1;
+}
+
+def AMDGPU_PackedStochRoundFp8Op :
+    AMDGPU_Op<"packed_stoch_round_fp8", [Pure]>,
+    Arguments<(ins F32:$source,
+      I32:$stochiasticParam,
+      ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<3>]>:$storeIndex,
+      Optional<FixedVectorOfLengthAndType<[4], [F8E4M3FNUZ, F8E5M2FNUZ]>>:$existing)>,
+    Results<(outs FixedVectorOfLengthAndType<[4], [F8E4M3FNUZ, F8E5M2FNUZ]>:$res)> {
+  let summary = "Round float stochastically into a packed vector of 8-bit floats";
+  let description = [{
+    Round the input `source`, adding in `stochiasticParam`, and place it into
+    the `storeIndex`th element of `res`.
+
+    If `existing` is passed in, elements of `res` other than the one at `storeIndex`
+    are copied from `existing`.
+
+    The reason for this odd signature is that AMD GPUs cannot easily work with
+    sub-registers, and so the conversion intrinsics (which are currently the
+    only way to work with 8-bit float types) take packed vectors of 4 8-bit
+    values.
+  }];
+  let assemblyFormat = [{
+    attr-dict $source `+` $stochiasticParam
+    `into` ($existing^):(`undef`)? `[` $storeIndex `]`
+    `:` type($source) `to` type($res) (`into` type($existing)^)?
+  }];
+  let hasVerifier = 1;
+}
+
 /// Raw buffer load
 def AMDGPU_RawBufferLoadOp :
     AMDGPU_Op<"raw_buffer_load", [AllElementTypesMatch<["value", "memref"]>,
 
@@ -116,7 +116,7 @@ class ROCDL_MbcntOp<string mnemonic> :
 def ROCDL_MbcntLoOp : ROCDL_MbcntOp<"lo">;
 def ROCDL_MbcntHiOp : ROCDL_MbcntOp<"hi">;
 
-def ROCDL_DsSwizzleOp : 
+def ROCDL_DsSwizzleOp :
 ROCDL_Op<"ds_swizzle">,
 Results<(outs I32:$res)>,
 Arguments<(ins I32:$src,
@@ -130,7 +130,7 @@ Arguments<(ins I32:$src,
    }];
 }
 
-def ROCDL_DsBpermuteOp : 
+def ROCDL_DsBpermuteOp :
 ROCDL_Op<"ds_bpermute">,
 Results<(outs I32:$res)>,
 Arguments<(ins I32:$index,
@@ -525,6 +525,85 @@ def ROCDL_RawBufferAtomicUMinOp :
   let hasCustomAssemblyFormat = 1;
 }
 
+//===---------------------------------------------------------------------===//
+// 8-bit float intrinsics
+//===---------------------------------------------------------------------===//
+def ROCDL_CvtF32Bf8Op :
+    ROCDL_IntrOp<"cvt.f32.bf8", [], [], [Pure], 1>,
+    Arguments<(ins I32:$srcA, I32:$byteSel)> {
+  let summary = "Convert bf8 to f32";
+  let description = [{
+    Convert 8-bit bf8 value from the `byteSel`th byte of `srcA` to fp32.
+  }];
+  let assemblyFormat = [{
+    attr-dict $srcA `[` $byteSel `]` `:` type($res)
+  }];
+}
+
+def ROCDL_CvtF32Fp8Op :
+    ROCDL_IntrOp<"cvt.f32.fp8", [], [], [Pure], 1>,
+    Arguments<(ins I32:$srcA, I32:$byteSel)> {
+  let summary = "Convert fp8 to f32";
+  let description = [{
+    Convert 8-bit fp8 value from the `byteSel`th byte of `srcA` to fp32.
+  }];
+  let assemblyFormat = [{
+    attr-dict $srcA `[` $byteSel `]` `:` type($res)
+  }];
+}
+
+def ROCDL_CvtPkBf8F32Op :
+    ROCDL_IntrOp<"cvt.pk.bf8.f32", [], [], [Pure], 1>,
+    Arguments<(ins F32:$srcA, F32:$srcB, I32:$old, I1:$wordSel)> {
+  let summary = "Convert two f32's to bf8";
+  let description = [{
+    Convert `srcA` and `srcB` to bf8 and store into the low/high word of
+    `old`, preserving the other word.
+  }];
+  let assemblyFormat = [{
+    attr-dict $srcA `,` $srcB `->` $old `[` $wordSel `]` `:` type($res)
+  }];
+}
+
+def ROCDL_CvtPkFp8F32Op :
+    ROCDL_IntrOp<"cvt.pk.fp8.f32", [], [], [Pure], 1>,
+    Arguments<(ins F32:$srcA, F32:$srcB, I32:$old, I1:$wordSel)> {
+  let summary = "Convert two f32's to fp8";
+  let description = [{
+    Convert `srcA` and `srcB` to fp8 and store into the low/high word of
+    `old`, preserving the other word.
+  }];
+  let assemblyFormat = [{
+    attr-dict $srcA `,` $srcB `->` $old `[` $wordSel `]` `:` type($res)
+  }];
+}
+
+def ROCDL_CvtSrBf8F32Op :
+    ROCDL_IntrOp<"cvt.sr.bf8.f32", [], [], [Pure], 1>,
+    Arguments<(ins F32:$srcA, I32:$srcB, I32:$old, I32:$byteSel)> {
+  let summary = "Convert f32 to bf8, stochastic rounding";
+  let description = [{
+    Convert `srcA` to bf8, adding the rounding factor from `srcB`,
+    and store into the `byteSel`th byte of `old`, preserving the others.
+  }];
+  let assemblyFormat = [{
+    attr-dict $srcA `,` $srcB `->` $old `[` $byteSel `]` `:` type($res)
+  }];
+}
+
+def ROCDL_CvtSrFp8F32Op :
+    ROCDL_IntrOp<"cvt.sr.fp8.f32", [], [], [Pure], 1>,
+    Arguments<(ins F32:$srcA, I32:$srcB, I32:$old, I32:$byteSel)> {
+  let summary = "Convert f32 to fp8, stochastic rounding";
+  let description = [{
+    Convert `srcA` to fp8, adding the rounding factor from `srcB`,
+    and store into the `byteSel`th byte of `old`, preserving the others.
+  }];
+  let assemblyFormat = [{
+    attr-dict $srcA `,` $srcB `->` $old `[` $byteSel `]` `:` type($res)
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // ROCDL target attribute.
 //===----------------------------------------------------------------------===//
@@ -612,5 +691,4 @@ def ROCDL_TargettAttr :
     }
   }];
 }
-
 #endif // ROCDLIR_OPS