Commit 832cd74
[AArch64] Armv8.6-a Matrix Mult Assembly + Intrinsics
This patch upstreams support for the Armv8.6-a Matrix Multiplication Extension. A summary of the features can be found here:
https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/arm-architecture-developments-armv8-6-a

This patch includes:
- Assembly support for AArch64 only (no SVE or Neon)
- Intrinsics support for AArch64 Armv8.6-a Matrix Multiplication instructions (no bfloat16 matrix multiplication)

No IR types or C types are needed for this extension.

This is part of a patch series, starting with BFloat16 support and the other components of the Armv8.6-a extension (in previous patches linked in Phabricator).

Based on work by:
- Luke Geeson
- Oliver Stannard
- Luke Cheeseman

Reviewers: ostannard, t.p.northover, rengolin, kmclaughlin

Reviewed By: kmclaughlin

Subscribers: kmclaughlin, kristof.beyls, hiraditya, danielkiss, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D77871
1 parent dc9cff1 commit 832cd74
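As a quick orientation, the following scalar model illustrates what the headline vmmlaq_s32 intrinsic computes. It is a sketch written against the Armv8.6-A SMMLA description, not code from this patch, and the function name smmla_ref is invented:

    #include <stdint.h>

    /* Scalar model of vmmlaq_s32(r, a, b): r, viewed as a 2x2 int32 matrix,
       accumulates the product of a (a 2x8 int8 matrix) with the transpose of
       b (also 2x8 int8). Sketch per the Armv8.6-A SMMLA description. */
    static void smmla_ref(int32_t r[2][2], const int8_t a[2][8],
                          const int8_t b[2][8]) {
      for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j)
          for (int k = 0; k < 8; ++k)
            r[i][j] += (int32_t)a[i][k] * (int32_t)b[j][k];
    }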

15 files changed (+561 -18 lines)

clang/include/clang/Basic/arm_neon.td

Lines changed: 33 additions & 1 deletion
@@ -221,6 +221,21 @@ def OP_FMLAL_LN_Hi : Op<(call "vfmlal_high", $p0, $p1,
 def OP_FMLSL_LN_Hi : Op<(call "vfmlsl_high", $p0, $p1,
                          (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
 
+def OP_USDOT_LN
+    : Op<(call "vusdot", $p0, $p1,
+          (cast "8", "S", (call_mangled "splat_lane", (bitcast "int32x2_t", $p2), $p3)))>;
+def OP_USDOT_LNQ
+    : Op<(call "vusdot", $p0, $p1,
+          (cast "8", "S", (call_mangled "splat_lane", (bitcast "int32x4_t", $p2), $p3)))>;
+
+// sudot splats the second vector and then calls vusdot
+def OP_SUDOT_LN
+    : Op<(call "vusdot", $p0,
+          (cast "8", "U", (call_mangled "splat_lane", (bitcast "int32x2_t", $p2), $p3)), $p1)>;
+def OP_SUDOT_LNQ
+    : Op<(call "vusdot", $p0,
+          (cast "8", "U", (call_mangled "splat_lane", (bitcast "int32x4_t", $p2), $p3)), $p1)>;
+
 //===----------------------------------------------------------------------===//
 // Auxiliary Instructions
 //===----------------------------------------------------------------------===//

@@ -1792,6 +1807,23 @@ let ArchGuard = "defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__)" in {
   }
 }
 
+let ArchGuard = "defined(__ARM_FEATURE_MATMUL_INT8)" in {
+  def VMMLA   : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
+  def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;
+
+  def VUSDOT  : SInst<"vusdot", "..(<<U)(<<)", "iQi">;
+
+  def VUSDOT_LANE : SOpInst<"vusdot_lane", "..(<<U)(<<q)I", "iQi", OP_USDOT_LN>;
+  def VSUDOT_LANE : SOpInst<"vsudot_lane", "..(<<)(<<qU)I", "iQi", OP_SUDOT_LN>;
+
+  let ArchGuard = "defined(__aarch64__)" in {
+    let isLaneQ = 1 in {
+      def VUSDOT_LANEQ : SOpInst<"vusdot_laneq", "..(<<U)(<<Q)I", "iQi", OP_USDOT_LNQ>;
+      def VSUDOT_LANEQ : SOpInst<"vsudot_laneq", "..(<<)(<<QU)I", "iQi", OP_SUDOT_LNQ>;
+    }
+  }
+}
+
 // v8.3-A Vector complex addition intrinsics
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in {
   def VCADD_ROT90_FP16 : SInst<"vcadd_rot90", "...", "h">;

@@ -1808,4 +1840,4 @@ let ArchGuard = "defined(__ARM_FEATURE_COMPLEX)" in {
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__aarch64__)" in {
   def VCADDQ_ROT90_FP64  : SInst<"vcaddq_rot90", "QQQ", "d">;
   def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">;
-}
+}
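The OP_SUDOT_LN/OP_SUDOT_LNQ definitions above implement sudot by splatting the chosen lane of the unsigned vector and calling vusdot with the operand order swapped. A rough C equivalent for lane 0, using the intrinsics this patch adds (the helper name is invented for illustration):

    #include <arm_neon.h>

    /* Rough expansion of vsudot_lane_s32(r, a, b, 0) per OP_SUDOT_LN:
       splat lane 0 of b (viewed as 32-bit lanes), then pass the splat as
       the unsigned operand of vusdot. Illustrative sketch only. */
    int32x2_t sudot_lane0_expansion(int32x2_t r, int8x8_t a, uint8x8_t b) {
      uint32x2_t b32 = vreinterpret_u32_u8(b);
      uint8x8_t bsplat = vreinterpret_u8_u32(vdup_lane_u32(b32, 0));
      return vusdot_s32(r, bsplat, a);
    }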

clang/lib/Basic/Targets/AArch64.cpp

Lines changed: 6 additions & 0 deletions
@@ -280,6 +280,9 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
   if (HasTME)
     Builder.defineMacro("__ARM_FEATURE_TME", "1");
 
+  if (HasMatMul)
+    Builder.defineMacro("__ARM_FEATURE_MATMUL_INT8", "1");
+
   if ((FPU & NeonMode) && HasFP16FML)
     Builder.defineMacro("__ARM_FEATURE_FP16FML", "1");

@@ -356,6 +359,7 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
   HasFP16FML = false;
   HasMTE = false;
   HasTME = false;
+  HasMatMul = false;
   ArchKind = llvm::AArch64::ArchKind::ARMV8A;
 
   for (const auto &Feature : Features) {

@@ -391,6 +395,8 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasMTE = true;
     if (Feature == "+tme")
       HasTME = true;
+    if (Feature == "+i8mm")
+      HasMatMul = true;
   }
 
   setDataLayout();

clang/lib/Basic/Targets/AArch64.h

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
   bool HasFP16FML;
   bool HasMTE;
   bool HasTME;
+  bool HasMatMul;
 
   llvm::AArch64::ArchKind ArchKind;

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 24 additions & 0 deletions
@@ -5009,6 +5009,7 @@ static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
   NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0),
   NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0),
   NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0),
+  NEONMAP2(vmmlaq_v, aarch64_neon_ummla, aarch64_neon_smmla, 0),
   NEONMAP0(vmovl_v),
   NEONMAP0(vmovn_v),
   NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType),

@@ -5091,6 +5092,9 @@ static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
   NEONMAP0(vsubhn_v),
   NEONMAP0(vtst_v),
   NEONMAP0(vtstq_v),
+  NEONMAP1(vusdot_v, aarch64_neon_usdot, 0),
+  NEONMAP1(vusdotq_v, aarch64_neon_usdot, 0),
+  NEONMAP1(vusmmlaq_v, aarch64_neon_usmmla, 0),
 };
 
 static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {

@@ -6076,6 +6080,26 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
     llvm::Type *Tys[2] = { Ty, InputTy };
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high");
   }
+  case NEON::BI__builtin_neon_vmmlaq_v: {
+    llvm::Type *InputTy =
+        llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmmla");
+  }
+  case NEON::BI__builtin_neon_vusmmlaq_v: {
+    llvm::Type *InputTy =
+        llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusmmla");
+  }
+  case NEON::BI__builtin_neon_vusdot_v:
+  case NEON::BI__builtin_neon_vusdotq_v: {
+    llvm::Type *InputTy =
+        llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot");
+  }
   }
 
   assert(Int && "Expected valid intrinsic number");
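Note how the vmmlaq_v case above resolves one builtin to two intrinsics: NEONMAP2 records the ummla/smmla pair, and Usgn selects between them from the builtin's type signature. In user code (expected IR per the tests below; the wrapper names are illustrative):

    #include <arm_neon.h>

    /* The same __builtin_neon_vmmlaq_v lowers by signedness. */
    uint32x4_t mmla_u(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
      return vmmlaq_u32(r, a, b); /* -> @llvm.aarch64.neon.ummla.v4i32.v16i8 */
    }
    int32x4_t mmla_s(int32x4_t r, int8x16_t a, int8x16_t b) {
      return vmmlaq_s32(r, a, b); /* -> @llvm.aarch64.neon.smmla.v4i32.v16i8 */
    }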

clang/test/CodeGen/aarch64-matmul.cpp

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +neon -target-feature +i8mm -S -emit-llvm %s -o - | FileCheck %s
+
+#ifdef __ARM_FEATURE_MATMUL_INT8
+extern "C" void arm_feature_matmulint8_defined() {}
+#endif
+// CHECK: define void @arm_feature_matmulint8_defined()
+
+
Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
+// RUN: -fallow-half-arguments-and-returns -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg -sroa \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: test_vmmlaq_s32
+// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+// CHECK: ret <4 x i32> [[VAL]]
+int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) {
+  return vmmlaq_s32(r, a, b);
+}
+
+// CHECK-LABEL: test_vmmlaq_u32
+// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+// CHECK: ret <4 x i32> [[VAL]]
+uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
+  return vmmlaq_u32(r, a, b);
+}
+
+// CHECK-LABEL: test_vusmmlaq_s32
+// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+// CHECK: ret <4 x i32> [[VAL]]
+int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
+  return vusmmlaq_s32(r, a, b);
+}
+
+// CHECK-LABEL: test_vusdot_s32
+// CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
+// CHECK: ret <2 x i32> [[VAL]]
+int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
+  return vusdot_s32(r, a, b);
+}
+
+// CHECK-LABEL: test_vusdot_lane_s32
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
+// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]])
+// CHECK: ret <2 x i32> [[OP]]
+int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
+  return vusdot_lane_s32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vsudot_lane_s32
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
+// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a)
+// CHECK: ret <2 x i32> [[OP]]
+int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) {
+  return vsudot_lane_s32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vusdot_laneq_s32
+// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
+// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]])
+// CHECK: ret <2 x i32> [[OP]]
+int32x2_t test_vusdot_laneq_s32(int32x2_t r, uint8x8_t a, int8x16_t b) {
+  return vusdot_laneq_s32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vsudot_laneq_s32
+// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
+// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a)
+// CHECK: ret <2 x i32> [[OP]]
+int32x2_t test_vsudot_laneq_s32(int32x2_t r, int8x8_t a, uint8x16_t b) {
+  return vsudot_laneq_s32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vusdotq_s32
+// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+// CHECK: ret <4 x i32> [[VAL]]
+int32x4_t test_vusdotq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
+  return vusdotq_s32(r, a, b);
+}
+
+// CHECK-LABEL: test_vusdotq_lane_s32
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
+// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]])
+// CHECK: ret <4 x i32> [[OP]]
+int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) {
+  return vusdotq_lane_s32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vsudotq_lane_s32
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
+// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a)
+// CHECK: ret <4 x i32> [[OP]]
+int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) {
+  return vsudotq_lane_s32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vusdotq_laneq_s32
+// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
+// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]])
+// CHECK: ret <4 x i32> [[OP]]
+int32x4_t test_vusdotq_laneq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
+  return vusdotq_laneq_s32(r, a, b, 0);
+}
+
+// CHECK-LABEL: test_vsudotq_laneq_s32
+// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
+// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a)
+// CHECK: ret <4 x i32> [[OP]]
+int32x4_t test_vsudotq_laneq_s32(int32x4_t r, int8x16_t a, uint8x16_t b) {
+  return vsudotq_laneq_s32(r, a, b, 0);
+}
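For reference, the dot-product tests above follow the USDOT semantics: each 32-bit lane of the accumulator absorbs a 4-way dot product of unsigned bytes from a with signed bytes from b. A scalar model (a sketch from the Armv8.6-A USDOT description; usdot_ref is an invented name, not code from this patch):

    #include <stdint.h>

    /* Scalar model of vusdot_s32(r, a, b): per 32-bit lane i, accumulate
       the dot product of four unsigned bytes of a with four signed bytes
       of b. */
    static void usdot_ref(int32_t r[2], const uint8_t a[8], const int8_t b[8]) {
      for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 4; ++j)
          r[i] += (int32_t)a[4 * i + j] * (int32_t)b[4 * i + j];
    }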

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 11 additions & 0 deletions
@@ -173,6 +173,11 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
                 [IntrNoMem]>;
+
+  class AdvSIMD_MatMul_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
+                [IntrNoMem]>;
 }

@@ -449,6 +454,12 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
   def int_aarch64_neon_udot : AdvSIMD_Dot_Intrinsic;
   def int_aarch64_neon_sdot : AdvSIMD_Dot_Intrinsic;
 
+  // v8.6-A Matrix Multiply Intrinsics
+  def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
+  def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
+  def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
+  def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
+
   // v8.2-A FP16 Fused Multiply-Add Long
   def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
   def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;
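AdvSIMD_MatMul_Intrinsic mirrors the shape of AdvSIMD_Dot_Intrinsic: the result vector type and the twice-matched input vector type are both overloaded, so a single def covers both register widths. For example, the one int_aarch64_neon_usdot def serves both calls below (mangled names as seen in the tests above; the wrapper names are illustrative):

    #include <arm_neon.h>

    /* One overloaded intrinsic def, instantiated per width:
       64-bit:  @llvm.aarch64.neon.usdot.v2i32.v8i8
       128-bit: @llvm.aarch64.neon.usdot.v4i32.v16i8 */
    int32x2_t dot64(int32x2_t r, uint8x8_t a, int8x8_t b) {
      return vusdot_s32(r, a, b);
    }
    int32x4_t dot128(int32x4_t r, uint8x16_t a, int8x16_t b) {
      return vusdotq_s32(r, a, b);
    }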

llvm/lib/Target/AArch64/AArch64.td

Lines changed: 10 additions & 2 deletions
@@ -373,14 +373,22 @@ def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
 def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16",
     "true", "Enable BFloat16 Extension" >;
 
+def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8",
+    "true", "Enable Matrix Multiply Int8 Extension">;
+
+def FeatureMatMulFP32 : SubtargetFeature<"f32mm", "HasMatMulFP32",
+    "true", "Enable Matrix Multiply FP32 Extension", [FeatureSVE]>;
+
+def FeatureMatMulFP64 : SubtargetFeature<"f64mm", "HasMatMulFP64",
+    "true", "Enable Matrix Multiply FP64 Extension", [FeatureSVE]>;
+
 def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps",
     "true", "Enable fine grained virtualization traps extension">;
 
 def FeatureEnhancedCounterVirtualization :
     SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization",
     "true", "Enable enhanced counter virtualization extension">;
 
-
 //===----------------------------------------------------------------------===//
 // Architectures.
 //

@@ -413,7 +421,7 @@ def HasV8_6aOps : SubtargetFeature<
   "v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions",
 
   [HasV8_5aOps, FeatureAMVS, FeatureBF16, FeatureFineGrainedTraps,
-   FeatureEnhancedCounterVirtualization]>;
+   FeatureEnhancedCounterVirtualization, FeatureMatMulInt8]>;
 
 //===----------------------------------------------------------------------===//
 // Register File Description
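Since FeatureMatMulInt8 is now part of HasV8_6aOps, compiling with -march=armv8.6-a should enable the extension (and, via the AArch64.cpp change above, define the guard macro) without an explicit +i8mm modifier. A sketch of macro-gated usage under that assumption:

    /* Builds the accelerated path only when the target enables i8mm,
       e.g. under -march=armv8.6-a. Illustrative sketch, not from the patch. */
    #ifdef __ARM_FEATURE_MATMUL_INT8
    #include <arm_neon.h>
    int32x4_t mm_accumulate(int32x4_t acc, int8x16_t a, int8x16_t b) {
      return vmmlaq_s32(acc, a, b);
    }
    #endif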
