llvm
diff --git a/‎llvm/docs/AMDGPUUsage.rst
Lines changed: 13 additions & 0 deletions b/‎llvm/docs/AMDGPUUsage.rst
Lines changed: 13 additions & 0 deletions
diff --git a/‎llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Lines changed: 23 additions & 0 deletions b/‎llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Lines changed: 23 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUGISel.td
Lines changed: 3 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUGISel.td
Lines changed: 3 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Lines changed: 12 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Lines changed: 12 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
Lines changed: 2 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
Lines changed: 2 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
Lines changed: 1 addition & 1 deletion b/‎llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
Lines changed: 1 addition & 1 deletion
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Lines changed: 19 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Lines changed: 19 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
Lines changed: 20 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
Lines changed: 20 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
Lines changed: 4 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
Lines changed: 4 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Lines changed: 17 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Lines changed: 17 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/SIInstrFormats.td
Lines changed: 5 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/SIInstrFormats.td
Lines changed: 5 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/SIInstrInfo.h
Lines changed: 6 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/SIInstrInfo.h
Lines changed: 6 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/SIInstrInfo.td
Lines changed: 16 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/SIInstrInfo.td
Lines changed: 16 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/SIRegisterInfo.td
Lines changed: 2 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/SIRegisterInfo.td
Lines changed: 2 additions & 0 deletions
@@ -1397,6 +1397,19 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
                                                    used by hardware to control active lanes when used in EXEC register.
                                                    For example, ballot(i1 true) return EXEC mask.
 
+  llvm.amdgcn.mfma.f32.16x16x128.f8f6f4.scaled     Emit `v_mfma_f32_16x16x128_f8f6f4`, bundled with a `v_mfma_ld_scale_b32`
+                                                   to set the scale factor. The last 4 operands correspond to the inputs
+                                                   to `v_mfma_ld_scale_b32`:
+
+   llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4     Emit `v_mfma_scale_f32_16x16x128_f8f6f4` to set the scale factor. The
+                                                   last 4 operands correspond to the scale inputs.
+                                                     2-bit byte index to use for each lane for matrix A
+                                                     Matrix A scale values
+                                                     2-bit byte index to use for each lane for matrix B
+                                                     Matrix B scale values
+
+   llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4      Emit `v_mfma_scale_f32_32x32x64_f8f6f4`
+
   ==============================================   ==========================================================
 
 .. TODO::
 
@@ -2968,6 +2968,27 @@ class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
             [IntrConvergent, IntrNoMem,
              ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
+
+class AMDGPUMfmaScaleIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
+  ClangBuiltin<!subst("int", "__builtin", NAME)>,
+  DefaultAttrsIntrinsic<[DestTy],
+            [SrcABTy, SrcABTy, DestTy,
+             llvm_i32_ty, // cbsz
+             llvm_i32_ty, // abid
+             llvm_i32_ty, // blgp
+             // llvm_i1_ty, // TODO: neg_src2
+             // llvm_i1_ty, // TODO: abs_src2
+             // llvm_i1_ty, // TODO: clamp
+             llvm_i32_ty, // op_sel (A matrix scale, 2-bits) // TODO: Make i2?
+             llvm_i32_ty, // v_mfma_ld_scale_b32 src0 (A matrix scale)
+             llvm_i32_ty, // op_sel (B matrix scale, 2-bits) // TODO: Make i2?
+             llvm_i32_ty  // v_mfma_ld_scale_b32 src1 (B matrix scale)
+            ],
+            [IntrConvergent, IntrNoMem,
+             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>,
+             ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<8>>
+             ]>;
+
 defset list<Intrinsic> AMDGPUMFMAIntrinsics908 = {
 def int_amdgcn_mfma_f32_32x32x1f32  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>;
 def int_amdgcn_mfma_f32_16x16x1f32  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
@@ -3119,6 +3140,8 @@ def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v
 def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
 
 def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
+def int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v4f32_ty, llvm_v8i32_ty>;
+def int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v16f32_ty, llvm_v8i32_ty>;
 }
 
 //===----------------------------------------------------------------------===//
 
@@ -423,3 +423,6 @@ def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">,
 
 def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">,
   GISDNodeXFormEquiv<as_hw_round_mode>;
+
+def gi_MFMALdScaleModifierOp : GICustomOperandRenderer<"renderScaledMAIIntrinsicOperand">,
+  GISDNodeXFormEquiv<MFMALdScaleXForm>;
@@ -5737,6 +5737,18 @@ void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
   MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
 }
 
+/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
+void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
+    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
+  unsigned Val = MI.getOperand(OpIdx).getImm();
+  unsigned New = 0;
+  if (Val & 0x1)
+    New |= SISrcMods::OP_SEL_0;
+  if (Val & 0x2)
+    New |= SISrcMods::OP_SEL_1;
+  MIB.addImm(New);
+}
+
 bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
   return TII.isInlineConstant(Imm);
 }
 
@@ -364,6 +364,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
 
   void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI,
                        int OpIdx) const;
+  void renderScaledMAIIntrinsicOperand(MachineInstrBuilder &MIB,
+                                       const MachineInstr &MI, int OpIdx) const;
 
   bool isInlineImmediate(const APInt &Imm) const;
   bool isInlineImmediate(const APFloat &Imm) const;
 
@@ -40,7 +40,7 @@ class AMDGPUInst <dag outs, dag ins, string asm = "",
   // instructions to not match without killing the whole decode process. It is
   // mainly used for ARM, but Tablegen expects this field to exist or it fails
   // to build the decode table.
-  field bits<96> SoftFail = 0;
+  field bits<128> SoftFail = 0; // FIXME: If this is smaller than largest instruction, DecodeEmitter crashes
 
   let DecoderNamespace = Namespace;
 
 
@@ -4769,6 +4769,25 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
               : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
       break;
     }
+    case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
+    case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
+      const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+      OpdsMapping[0] =
+          Info->mayNeedAGPRs()
+              ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
+              : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+
+      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+      OpdsMapping[4] =
+          Info->mayNeedAGPRs()
+              ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
+              : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+
+      OpdsMapping[9] = getVGPROpMapping(MI.getOperand(9).getReg(), MRI, *TRI);
+      OpdsMapping[11] = getVGPROpMapping(MI.getOperand(11).getReg(), MRI, *TRI);
+      break;
+    }
     case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
     case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
     case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
 
@@ -493,6 +493,17 @@ static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
   return DecoderUInt128(Lo, Hi);
 }
 
+static inline DecoderUInt128 eat16Bytes(ArrayRef<uint8_t> &Bytes) {
+  assert(Bytes.size() >= 16);
+  uint64_t Lo =
+      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+  Bytes = Bytes.slice(8);
+  uint64_t Hi =
+      support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
+  Bytes = Bytes.slice(8);
+  return DecoderUInt128(Lo, Hi);
+}
+
 DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
                                                 ArrayRef<uint8_t> Bytes_,
                                                 uint64_t Address,
@@ -529,6 +540,15 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
 
       // Reinitialize Bytes
       Bytes = Bytes_.slice(0, MaxInstBytesNum);
+
+    } else if (Bytes.size() >= 16 &&
+               STI.hasFeature(AMDGPU::FeatureGFX950Insts)) {
+      DecoderUInt128 DecW = eat16Bytes(Bytes);
+      if (tryDecodeInst(DecoderTableGFX940128, MI, DecW, Address, CS))
+        break;
+
+      // Reinitialize Bytes
+      Bytes = Bytes_.slice(0, MaxInstBytesNum);
     }
 
     if (Bytes.size() >= 8) {
 
@@ -59,6 +59,10 @@ unsigned AMDGPUMCAsmInfo::getMaxInstLength(const MCSubtargetInfo *STI) const {
   if (STI->hasFeature(AMDGPU::FeatureNSAEncoding))
     return 20;
 
+  // VOP3PX encoding.
+  if (STI->hasFeature(AMDGPU::FeatureGFX950Insts))
+    return 16;
+
   // 64-bit instruction with 32-bit literal.
   if (STI->hasFeature(AMDGPU::FeatureVOP3Literal))
     return 12;
 
@@ -15449,6 +15449,23 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
         MRI.setRegClass(Op.getReg(), NewRC);
       }
 
+      if (TII->isMAI(MI)) {
+        // The ordinary src0, src1, src2 were legalized above.
+        //
+        // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
+        // as a separate instruction.
+        int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                                 AMDGPU::OpName::scale_src0);
+        if (Src0Idx != -1) {
+          int Src1Idx = Src0Idx + 2;
+          assert(Src1Idx = AMDGPU::getNamedOperandIdx(
+                     MI.getOpcode(), AMDGPU::OpName::scale_src1));
+          if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
+              TII->usesConstantBus(MRI, MI, Src1Idx))
+            TII->legalizeOpWithMove(MI, Src1Idx);
+        }
+      }
+
       if (!HasAGPRs)
         return;
 
 
@@ -300,6 +300,11 @@ class Enc96 {
   int Size = 12;
 }
 
+class Enc128 {
+  field bits<128> Inst;
+  int Size = 16;
+}
+
 def CPolBit {
   int GLC = 0;
   int SLC = 1;
 
@@ -1115,6 +1115,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
                        const MachineOperand &MO,
                        const MCOperandInfo &OpInfo) const;
 
+  bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineInstr &MI,
+                       int OpIdx) const {
+    return usesConstantBus(MRI, MI.getOperand(OpIdx),
+                           MI.getDesc().operands()[OpIdx]);
+  }
+
   /// Return true if this instruction has any modifiers.
   ///  e.g. src[012]_mod, omod, clamp.
   bool hasModifiers(unsigned Opcode) const;
 
@@ -914,6 +914,16 @@ def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{
   return fp16SrcZerosHighBits(N->getOpcode());
 }]>;
 
+def MFMALdScaleXForm : SDNodeXForm<timm, [{
+  unsigned Val = N->getZExtValue();
+  unsigned New = 0;
+  if (Val & 0x1)
+    New |= SISrcMods::OP_SEL_0;
+  if (Val & 0x2)
+    New |= SISrcMods::OP_SEL_1;
+  return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32);
+}]>;
+
 def is_canonicalized : PatLeaf<(fAny srcvalue:$src), [{
   const SITargetLowering &Lowering =
       *static_cast<const SITargetLowering *>(getTargetLowering());
@@ -1515,6 +1525,10 @@ class PackedIntInputMods <PackedIntInputModsMatchClass matchClass> : InputMods <
 def PackedF16InputMods : PackedFPInputMods<PackedF16InputModsMatchClass>;
 def PackedI16InputMods : PackedIntInputMods<PackedI16InputModsMatchClass>;
 
+def MFMALdScaleModifierOp : TImmLeaf<i32, [{
+  return isUInt<2>(Imm);
+}], MFMALdScaleXForm>;
+
 //===----------------------------------------------------------------------===//
 // Complex patterns
 //===----------------------------------------------------------------------===//
@@ -2851,6 +2865,8 @@ def VOP_V16F32_V2I32_V4I32_I32    : VOPProfile <[v16f32, v2i32, v4i32, i32]>;
 def VOP_V4F32_V8F16_V8F16_V4F32   : VOPProfile <[v4f32,  v8f16, v8f16, v4f32]>;
 def VOP_V16F32_V8F16_V8F16_V16F32 : VOPProfile <[v16f32, v8f16, v8f16, v16f32]>;
 def VOP_V16F32_V8BF16_V8BF16_V16F32 : VOPProfile <[v16f32, v8bf16, v8bf16, v16f32]>;
+def VOP_V4F32_V8I32_V8I32_V4F32   : VOPProfile <[v4f32,  v8i32, v8i32, v4f32]>;
+def VOP_V16F32_V8I32_V8I32_V16F32   : VOPProfile <[v16f32, v8i32, v8i32, v16f32]>;
 
 
 class Commutable_REV <string revOp, bit isOrig> {
 
@@ -1338,11 +1338,13 @@ class AVSrcOperand<RegisterClass regClass, string width>
 def AVSrc_32 : AVSrcOperand<AV_32, "OPW32">;
 def AVSrc_64 : AVSrcOperand<AV_64, "OPW64">;
 def AVSrc_128 : AVSrcOperand<AV_128, "OPW128">;
+def AVSrc_256 : AVSrcOperand<AV_256, "OPW256">;
 
 class AVDstOperand<RegisterClass regClass, string width>
   : AVOperand<regClass, "decodeAV10", width>;
 
 def AVDst_128 : AVDstOperand<AV_128, "OPW128">;
+def AVDst_256 : AVDstOperand<AV_256, "OPW256">;
 def AVDst_512 : AVDstOperand<AV_512, "OPW512">;
 
 class AVLdStOperand<RegisterClass regClass, string width>