
[GlobalISel][AArch64] Legalize G_ADD, G_SUB, G_AND, G_OR, and G_XOR for SVE #110561


Merged

merged 6 commits into llvm:main from gisel-sve-integer on Oct 27, 2024

Conversation

@tschuett commented Sep 30, 2024

Credits: #72976

LLVM ERROR: cannot select: %3:zpr(<vscale x 2 x s64>) = G_MUL %0:fpr, %1:fpr (in function: xmulnxv2i64)

;; mul
define void @xmulnxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, ptr %p) {
entry:
%c = mul <vscale x 2 x i64> %a, %b
store <vscale x 2 x i64> %c, ptr %p, align 16
ret void
}

define void @mulnxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, ptr %p) {
entry:
%c = mul <vscale x 4 x i32> %a, %b
store <vscale x 4 x i32> %c, ptr %p, align 16
ret void
}

define void @mulnxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, ptr %p) {
entry:
%c = mul <vscale x 8 x i16> %a, %b
store <vscale x 8 x i16> %c, ptr %p, align 16
ret void
}

define void @mulnxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, ptr %p) {
entry:
%c = mul <vscale x 16 x i8> %a, %b
store <vscale x 16 x i8> %c, ptr %p, align 16
ret void
}

@llvmbot (Member) commented Sep 30, 2024

@llvm/pr-subscribers-backend-aarch64

Author: Thorsten Schütt (tschuett)

Changes

Legalize G_ADD, G_SUB, G_AND, G_OR, and G_XOR for SVE.

Credits: #72976

LLVM ERROR: cannot select: %3:zpr(<vscale x 2 x s64>) = G_MUL %0:fpr, %1:fpr (in function: xmulnxv2i64)

;; mul
define void @xmulnxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, ptr %p) {
entry:
%c = mul <vscale x 2 x i64> %a, %b
store <vscale x 2 x i64> %c, ptr %p, align 16
ret void
}

define void @mulnxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, ptr %p) {
entry:
%c = mul <vscale x 4 x i32> %a, %b
store <vscale x 4 x i32> %c, ptr %p, align 16
ret void
}

define void @mulnxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, ptr %p) {
entry:
%c = mul <vscale x 8 x i16> %a, %b
store <vscale x 8 x i16> %c, ptr %p, align 16
ret void
}

define void @mulnxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, ptr %p) {
entry:
%c = mul <vscale x 16 x i8> %a, %b
store <vscale x 16 x i8> %c, ptr %p, align 16
ret void
}


Full diff: https://github.com/llvm/llvm-project/pull/110561.diff

7 Files Affected:

  • (modified) llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h (+9-5)
  • (modified) llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def (+2)
  • (modified) llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp (+2-2)
  • (modified) llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp (+9-1)
  • (modified) llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp (+32-1)
  • (added) llvm/test/CodeGen/AArch64/GlobalISel/regbank-mul.mir (+25)
  • (added) llvm/test/CodeGen/AArch64/sve-integer.ll (+268)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index 82e713f30ea31c..d42bfea2bd4438 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -970,8 +970,7 @@ class LegalizeRuleSet {
         LegalizeAction::WidenScalar,
         [=](const LegalityQuery &Query) {
           const LLT VecTy = Query.Types[TypeIdx];
-          return VecTy.isVector() && !VecTy.isScalable() &&
-                 VecTy.getSizeInBits() < VectorSize;
+          return VecTy.isFixedVector() && VecTy.getSizeInBits() < VectorSize;
         },
         [=](const LegalityQuery &Query) {
           const LLT VecTy = Query.Types[TypeIdx];
@@ -1139,7 +1138,7 @@ class LegalizeRuleSet {
         LegalizeAction::MoreElements,
         [=](const LegalityQuery &Query) {
           LLT VecTy = Query.Types[TypeIdx];
-          return VecTy.isVector() && VecTy.getElementType() == EltTy &&
+          return VecTy.isFixedVector() && VecTy.getElementType() == EltTy &&
                  VecTy.getNumElements() < MinElements;
         },
         [=](const LegalityQuery &Query) {
@@ -1157,7 +1156,7 @@ class LegalizeRuleSet {
         LegalizeAction::MoreElements,
         [=](const LegalityQuery &Query) {
           LLT VecTy = Query.Types[TypeIdx];
-          return VecTy.isVector() && VecTy.getElementType() == EltTy &&
+          return VecTy.isFixedVector() && VecTy.getElementType() == EltTy &&
                  (VecTy.getNumElements() % NumElts != 0);
         },
         [=](const LegalityQuery &Query) {
@@ -1177,7 +1176,7 @@ class LegalizeRuleSet {
         LegalizeAction::FewerElements,
         [=](const LegalityQuery &Query) {
           LLT VecTy = Query.Types[TypeIdx];
-          return VecTy.isVector() && VecTy.getElementType() == EltTy &&
+          return VecTy.isFixedVector() && VecTy.getElementType() == EltTy &&
                  VecTy.getNumElements() > MaxElements;
         },
         [=](const LegalityQuery &Query) {
@@ -1198,6 +1197,11 @@ class LegalizeRuleSet {
     assert(MinTy.getElementType() == MaxTy.getElementType() &&
            "Expected element types to agree");
 
+    if (MinTy.isScalableVector())
+      return actionIf(LegalizeAction::Unsupported, always);
+    if (MaxTy.isScalableVector())
+      return actionIf(LegalizeAction::Unsupported, always);
+
     const LLT EltTy = MinTy.getElementType();
     return clampMinNumElements(TypeIdx, EltTy, MinTy.getNumElements())
         .clampMaxNumElements(TypeIdx, EltTy, MaxTy.getNumElements());
diff --git a/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
index 82066b48c84b40..8ff59f60968beb 100644
--- a/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
+++ b/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
@@ -183,6 +183,8 @@ unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx,
     const unsigned MinSize = Size.getKnownMinValue();
     assert((!Size.isScalable() || MinSize >= 128) &&
            "Scalable vector types should have size of at least 128 bits");
+    if (Size.isScalable())
+      return 3;
     if (MinSize <= 16)
       return 0;
     if (MinSize <= 32)
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 5aee7804de3e3f..6cbfb018b3183a 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -393,8 +393,8 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
       // i1 is a special case because SDAG i1 true is naturally zero extended
       // when widened using ANYEXT. We need to do it explicitly here.
       auto &Flags = CurArgInfo.Flags[0];
-      if (MRI.getType(CurVReg).getSizeInBits() == 1 && !Flags.isSExt() &&
-          !Flags.isZExt()) {
+      if (MRI.getType(CurVReg).getSizeInBits() == TypeSize::getFixed(1) &&
+          !Flags.isSExt() && !Flags.isZExt()) {
         CurVReg = MIRBuilder.buildZExt(LLT::scalar(8), CurVReg).getReg(0);
       } else if (TLI.getNumRegistersForCallingConv(Ctx, CC, SplitEVTs[i]) ==
                  1) {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index df0c09d32c074a..afea08ab092501 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -615,6 +615,7 @@ getMinClassForRegBank(const RegisterBank &RB, TypeSize SizeInBits,
   unsigned RegBankID = RB.getID();
 
   if (RegBankID == AArch64::GPRRegBankID) {
+    assert(!SizeInBits.isScalable() && "Unexpected scalable register size");
     if (SizeInBits <= 32)
       return GetAllRegSet ? &AArch64::GPR32allRegClass
                           : &AArch64::GPR32RegClass;
@@ -626,6 +627,12 @@ getMinClassForRegBank(const RegisterBank &RB, TypeSize SizeInBits,
   }
 
   if (RegBankID == AArch64::FPRRegBankID) {
+    if (SizeInBits.isScalable()) {
+      assert(SizeInBits == TypeSize::getScalable(128) &&
+             "Unexpected scalable register size");
+      return &AArch64::ZPRRegClass;
+    }
+
     switch (SizeInBits) {
     default:
       return nullptr;
@@ -964,7 +971,8 @@ getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
   // then we can pull it into the helpers that get the appropriate class for a
   // register bank. Or make a new helper that carries along some constraint
   // information.
-  if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
+  if (SrcRegBank != DstRegBank &&
+      (DstSize == TypeSize::getFixed(1) && SrcSize == TypeSize::getFixed(1)))
     SrcSize = DstSize = TypeSize::getFixed(32);
 
   return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 51aeee023f2e34..910a4ab2ddc178 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -91,6 +91,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
 
   const bool HasCSSC = ST.hasCSSC();
   const bool HasRCPC3 = ST.hasRCPC3();
+  const bool HasSVE = ST.hasSVE();
 
   getActionDefinitionsBuilder(
       {G_IMPLICIT_DEF, G_FREEZE, G_CONSTANT_FOLD_BARRIER})
@@ -127,7 +128,37 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampNumElements(0, v2s64, v2s64)
       .moreElementsToNextPow2(0);
 
-  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
+  auto &IntegerArithmeticActions =
+      getActionDefinitionsBuilder({G_ADD, G_SUB, G_AND, G_OR, G_XOR});
+  if (HasSVE)
+    IntegerArithmeticActions.legalFor({nxv16s8, nxv8s16, nxv4s32, nxv2s64});
+  IntegerArithmeticActions
+      .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
+      .widenScalarToNextPow2(0)
+      .clampScalar(0, s32, s64)
+      .clampMaxNumElements(0, s8, 16)
+      .clampMaxNumElements(0, s16, 8)
+      .clampNumElements(0, v2s32, v4s32)
+      .clampNumElements(0, v2s64, v2s64)
+      .minScalarOrEltIf(
+          [=](const LegalityQuery &Query) {
+            return Query.Types[0].getNumElements() <= 2;
+          },
+          0, s32)
+      .minScalarOrEltIf(
+          [=](const LegalityQuery &Query) {
+            return Query.Types[0].getNumElements() <= 4;
+          },
+          0, s16)
+      .minScalarOrEltIf(
+          [=](const LegalityQuery &Query) {
+            return Query.Types[0].getNumElements() <= 16;
+          },
+          0, s8)
+      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
+      .moreElementsToNextPow2(0);
+
+  getActionDefinitionsBuilder(G_MUL)
       .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
       .widenScalarToNextPow2(0)
       .clampScalar(0, s32, s64)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-mul.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-mul.mir
new file mode 100644
index 00000000000000..d2e76227741cb6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-mul.mir
@@ -0,0 +1,25 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -mattr=+sve -run-pass=regbankselect -verify-machineinstrs %s -o - | FileCheck %s
+
+...
+---
+name:            fp_inputs
+legalized:       true
+body:             |
+  bb.0:
+    liveins: $s0, $s1
+
+    ; CHECK-LABEL: name: fp_inputs
+    ; CHECK: liveins: $s0, $s1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %lhs:fpr(<vscale x 2 x s64>) = COPY $z0
+    ; CHECK-NEXT: %rhs:fpr(<vscale x 2 x s64>) = COPY $z1
+    ; CHECK-NEXT: %res:fpr(<vscale x 2 x s64>) = G_MUL %lhs, %rhs
+    ; CHECK-NEXT: $z0 = COPY %res(<vscale x 2 x s64>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $z0
+    %lhs:_(<vscale x 2 x s64>) = COPY $z0
+    %rhs:_(<vscale x 2 x s64>) = COPY $z1
+    %res:_(<vscale x 2 x s64>) = G_MUL %lhs, %rhs
+    $z0 = COPY %res(<vscale x 2 x s64>)
+    RET_ReallyLR implicit $z0
+
diff --git a/llvm/test/CodeGen/AArch64/sve-integer.ll b/llvm/test/CodeGen/AArch64/sve-integer.ll
new file mode 100644
index 00000000000000..ad66190839ce0f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-integer.ll
@@ -0,0 +1,268 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple aarch64 -mattr=+sve -aarch64-enable-gisel-sve=1  | FileCheck %s
+; RUN: llc < %s -mtriple aarch64 -mattr=+sve -global-isel -aarch64-enable-gisel-sve=1 | FileCheck %s
+
+;; add
+define void @addnxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, ptr %p) {
+; CHECK-LABEL: addnxv2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = add <vscale x 2 x i64> %a, %b
+  store <vscale x 2 x i64> %c, ptr %p, align 16
+  ret void
+}
+
+define void @addnxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, ptr %p) {
+; CHECK-LABEL: addnxv4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = add <vscale x 4 x i32> %a, %b
+  store <vscale x 4 x i32> %c, ptr %p, align 16
+  ret void
+}
+
+define void @addnxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, ptr %p) {
+; CHECK-LABEL: addnxv8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add z0.h, z0.h, z1.h
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = add <vscale x 8 x i16> %a, %b
+  store <vscale x 8 x i16> %c, ptr %p, align 16
+  ret void
+}
+
+define void @addnxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, ptr %p) {
+; CHECK-LABEL: addnxv16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add z0.b, z0.b, z1.b
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = add <vscale x 16 x i8> %a, %b
+  store <vscale x 16 x i8> %c, ptr %p, align 16
+  ret void
+}
+
+;; sub
+define void @subnxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, ptr %p) {
+; CHECK-LABEL: subnxv2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = sub <vscale x 2 x i64> %a, %b
+  store <vscale x 2 x i64> %c, ptr %p, align 16
+  ret void
+}
+
+define void @subnxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, ptr %p) {
+; CHECK-LABEL: subnxv4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub z0.s, z0.s, z1.s
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = sub <vscale x 4 x i32> %a, %b
+  store <vscale x 4 x i32> %c, ptr %p, align 16
+  ret void
+}
+
+define void @subnxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, ptr %p) {
+; CHECK-LABEL: subnxv8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub z0.h, z0.h, z1.h
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = sub <vscale x 8 x i16> %a, %b
+  store <vscale x 8 x i16> %c, ptr %p, align 16
+  ret void
+}
+
+define void @subnxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, ptr %p) {
+; CHECK-LABEL: subnxv16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub z0.b, z0.b, z1.b
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = sub <vscale x 16 x i8> %a, %b
+  store <vscale x 16 x i8> %c, ptr %p, align 16
+  ret void
+}
+
+;; and
+define void @andnxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, ptr %p) {
+; CHECK-LABEL: andnxv2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = and <vscale x 2 x i64> %a, %b
+  store <vscale x 2 x i64> %c, ptr %p, align 16
+  ret void
+}
+
+define void @andnxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, ptr %p) {
+; CHECK-LABEL: andnxv4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = and <vscale x 4 x i32> %a, %b
+  store <vscale x 4 x i32> %c, ptr %p, align 16
+  ret void
+}
+
+define void @andnxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, ptr %p) {
+; CHECK-LABEL: andnxv8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = and <vscale x 8 x i16> %a, %b
+  store <vscale x 8 x i16> %c, ptr %p, align 16
+  ret void
+}
+
+define void @andnxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, ptr %p) {
+; CHECK-LABEL: andnxv16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = and <vscale x 16 x i8> %a, %b
+  store <vscale x 16 x i8> %c, ptr %p, align 16
+  ret void
+}
+
+;; or
+define void @ornxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, ptr %p) {
+; CHECK-LABEL: ornxv2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = or <vscale x 2 x i64> %a, %b
+  store <vscale x 2 x i64> %c, ptr %p, align 16
+  ret void
+}
+
+define void @ornxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, ptr %p) {
+; CHECK-LABEL: ornxv4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = or <vscale x 4 x i32> %a, %b
+  store <vscale x 4 x i32> %c, ptr %p, align 16
+  ret void
+}
+
+define void @ornxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, ptr %p) {
+; CHECK-LABEL: ornxv8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = or <vscale x 8 x i16> %a, %b
+  store <vscale x 8 x i16> %c, ptr %p, align 16
+  ret void
+}
+
+define void @ornxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, ptr %p) {
+; CHECK-LABEL: ornxv16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = or <vscale x 16 x i8> %a, %b
+  store <vscale x 16 x i8> %c, ptr %p, align 16
+  ret void
+}
+
+;; xor
+define void @xornxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, ptr %p) {
+; CHECK-LABEL: xornxv2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = xor <vscale x 2 x i64> %a, %b
+  store <vscale x 2 x i64> %c, ptr %p, align 16
+  ret void
+}
+
+define void @xornxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, ptr %p) {
+; CHECK-LABEL: xornxv4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = xor <vscale x 4 x i32> %a, %b
+  store <vscale x 4 x i32> %c, ptr %p, align 16
+  ret void
+}
+
+define void @xornxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, ptr %p) {
+; CHECK-LABEL: xornxv8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = xor <vscale x 8 x i16> %a, %b
+  store <vscale x 8 x i16> %c, ptr %p, align 16
+  ret void
+}
+
+define void @xornxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, ptr %p) {
+; CHECK-LABEL: xornxv16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %c = xor <vscale x 16 x i8> %a, %b
+  store <vscale x 16 x i8> %c, ptr %p, align 16
+  ret void
+}

@llvmbot (Member) commented Sep 30, 2024

@llvm/pr-subscribers-llvm-globalisel
@tschuett (Author) commented:

How is GlobalISel going to handle predication, e.g., ABS?

@tschuett tschuett changed the title [GlobalISel][AArch64] Legalize G_ADD, G_SUB, G_AND, G_OR, and G_XOR [GlobalISel][AArch64] Legalize G_ADD, G_SUB, G_AND, G_OR, and G_XOR for SVE Oct 2, 2024
@tschuett (Author) commented Oct 3, 2024

First bet: there is no pattern for mul for SVE.

@tschuett (Author) commented Oct 5, 2024

> ag "\(ADD_ZZZ_H" lib/Target/AArch64/AArch64GenGlobalISel.inc
8233:      // (add:{ *:[nxv8i16] } nxv8i16:{ *:[nxv8i16] }:$Op1, nxv8i16:{ *:[nxv8i16] }:$Op2)  =>  (ADD_ZZZ_H:{ *:[nxv8i16] } ?:{ *:[nxv8i16] }:$Op1, ?:{ *:[nxv8i16] }:$Op2)
>  ag "\(SUB_ZZZ_H" lib/Target/AArch64/AArch64GenGlobalISel.inc
10643:      // (sub:{ *:[nxv8i16] } nxv8i16:{ *:[nxv8i16] }:$Op1, nxv8i16:{ *:[nxv8i16] }:$Op2)  =>  (SUB_ZZZ_H:{ *:[nxv8i16] } ?:{ *:[nxv8i16] }:$Op1, ?:{ *:[nxv8i16] }:$Op2)
> ag "\(MUL_ZZZ_H" lib/Target/AArch64/AArch64GenGlobalISel.inc

There are indeed patterns missing for the SVE G_MUL.

@tschuett (Author) commented Oct 5, 2024

success:

> ag "\(AND_ZZZ" lib/Target/AArch64/AArch64GenGlobalISel.inc
> ag "\(EOR_ZZZ" lib/Target/AArch64/AArch64GenGlobalISel.inc
> ag "\(ORR_ZZZ" lib/Target/AArch64/AArch64GenGlobalISel.inc

@tschuett (Author) commented Oct 5, 2024

> ./bin/llvm-tblgen -gen-global-isel -warn-on-skipped-patterns

Included from llvm/lib/Target/AArch64/AArch64.td:37:
Included from llvm/lib/Target/AArch64/AArch64InstrInfo.td:10288:
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td:3491:21: note: instantiated from multiclass
  defm MUL_ZZZ    : sve2_int_mul<0b000,  "mul",   AArch64mul>;

The emitter does not support let statements:

let Predicates = [HasSVE2orSME] in {

but complains about the multiclass. There are many interesting instructions hidden inside such lets.

Yet there are no warnings about ADD_ZZZ and SUB_ZZZ, despite their being defined in let statements?!?

defm ADD_ZZZ   : sve_int_bin_cons_arit_0<0b000, "add", add>;
defm SUB_ZZZ   : sve_int_bin_cons_arit_0<0b001, "sub", sub>;

Add and sub are SVE while mul is SVE2?

let Predicates = [HasSVEorSME] in {

contains the working instructions; mul is in a different let.

@tschuett (Author) commented Oct 6, 2024

The complete error message about importing MUL_ZZZ:

Included from llvm/lib/Target/AArch64/AArch64.td:37:
Included from llvm/lib/Target/AArch64/AArch64InstrInfo.td:1035:
llvm/lib/Target/AArch64/SVEInstrFormats.td:3694:3: warning: Skipped pattern: Pattern operator lacks an equivalent Instruction (AArch64ISD::MUL_PRED)
  def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
  ^
Included from llvm/lib/Target/AArch64/AArch64.td:37:
Included from llvm/lib/Target/AArch64/AArch64InstrInfo.td:10288:
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td:3491:21: note: instantiated from multiclass
  defm MUL_ZZZ    : sve2_int_mul<0b000,  "mul",   AArch64mul>;
                    ^
Included from llvm/lib/Target/AArch64/AArch64.td:37:
Included from llvm/lib/Target/AArch64/AArch64InstrInfo.td:1035:
llvm/lib/Target/AArch64/SVEInstrFormats.td:3695:3: warning: Skipped pattern: Pattern operator lacks an equivalent Instruction (AArch64ISD::MUL_PRED)
  def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
  ^
Included from llvm/lib/Target/AArch64/AArch64.td:37:
Included from llvm/lib/Target/AArch64/AArch64InstrInfo.td:10288:
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td:3491:21: note: instantiated from multiclass
  defm MUL_ZZZ    : sve2_int_mul<0b000,  "mul",   AArch64mul>;
                    ^
Included from llvm/lib/Target/AArch64/AArch64.td:37:
Included from llvm/lib/Target/AArch64/AArch64InstrInfo.td:1035:
llvm/lib/Target/AArch64/SVEInstrFormats.td:3696:3: warning: Skipped pattern: Pattern operator lacks an equivalent Instruction (AArch64ISD::MUL_PRED)
  def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
  ^
Included from llvm/lib/Target/AArch64/AArch64.td:37:
Included from llvm/lib/Target/AArch64/AArch64InstrInfo.td:10288:
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td:3491:21: note: instantiated from multiclass
  defm MUL_ZZZ    : sve2_int_mul<0b000,  "mul",   AArch64mul>;
                    ^
Included from llvm/lib/Target/AArch64/AArch64.td:37:
Included from llvm/lib/Target/AArch64/AArch64InstrInfo.td:1035:
llvm/lib/Target/AArch64/SVEInstrFormats.td:3697:3: warning: Skipped pattern: Pattern operator lacks an equivalent Instruction (AArch64ISD::MUL_PRED)
  def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
  ^
Included from llvm/lib/Target/AArch64/AArch64.td:37:
Included from llvm/lib/Target/AArch64/AArch64InstrInfo.td:10288:
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td:3491:21: note: instantiated from multiclass
  defm MUL_ZZZ    : sve2_int_mul<0b000,  "mul",   AArch64mul>;

@tschuett (Author) commented Oct 7, 2024

There are several occurrences of

def AArch64mul_p  : SDNode<"AArch64ISD::MUL_PRED",  SDT_AArch64Arith>;

where AArch64ISD::MUL_PRED is a C++ enumerator, not a TableGen definition.
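As background for why that matters, here is a minimal sketch of the C++ side (an illustration assuming the usual AArch64ISelLowering.h layout, not a verbatim quote): the GlobalISel pattern importer can only match operators that are themselves defined in TableGen, and MUL_PRED exists only as a C++ enumerator.

// Sketch, assumed shape: MUL_PRED is a plain C++ enumerator for a
// target-specific SelectionDAG node. TableGen only sees the string
// "AArch64ISD::MUL_PRED", so the importer has no Instruction to match
// against -- hence the "Pattern operator lacks an equivalent
// Instruction" warnings above.
namespace AArch64ISD {
enum NodeType : unsigned {
  FIRST_NUMBER = ISD::BUILTIN_OP_END,
  // ...
  MUL_PRED, // predicated vector multiply, created during SDAG lowering
  // ...
};
} // namespace AArch64ISD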

@tschuett (Author) commented Oct 8, 2024

Ping.

@davemgreen (Collaborator) commented:

For the SVE issue you are seeing - some instructions in SVE are "unpredicated", but many require a predicate even if it is an all-true predicate vector. mul, I believe, is one where there is a predicated +sve instruction and an unpredicated +sve2 version. SDAG will convert them into MUL_PRED, but that has obvious downsides.

Do you mind if we have a first patch that just gets the arg-passing basics out of the way, and then start adding instruction support separately? Or does that all work already, and all of this is for instruction support? We should be able to add tests that pass in values and store them, or load values and return them. I feel we need to make sure SVE is decently designed. I suppose that, without any other suggestions, we can follow how SDAG behaves.
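To make the predication point concrete, here is a hypothetical sketch (not part of this patch; PTRUE_D and MUL_ZPmZ_D are the backend's existing instruction names, but the surrounding selection logic is assumed for illustration) of how a selector could use the predicated +sve multiply by materializing an all-true governing predicate:

// Hypothetical: select a <vscale x 2 x s64> G_MUL via the predicated
// SVE multiply, mirroring what SDAG achieves with AArch64ISD::MUL_PRED.
bool selectSVEPredicatedMul(MachineInstr &I, MachineIRBuilder &MIB) {
  Register Dst = I.getOperand(0).getReg();
  Register LHS = I.getOperand(1).getReg();
  Register RHS = I.getOperand(2).getReg();
  // ptrue p.d, all -- immediate pattern 31 activates all elements.
  auto PTrue =
      MIB.buildInstr(AArch64::PTRUE_D, {&AArch64::PPRRegClass}, {}).addImm(31);
  // mul z.d, p/m, z.d, z.d -- the merging, predicated form.
  MIB.buildInstr(AArch64::MUL_ZPmZ_D, {Dst}, {PTrue.getReg(0), LHS, RHS});
  I.eraseFromParent();
  return true;
}

The unpredicated +sve2 mul needs none of this, which is why the bitwise and add/sub ZZZ patterns import cleanly while the SVE1 mul path goes through MUL_PRED in SDAG.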

@tschuett (Author) commented Oct 10, 2024

I have ARM DDI 0487K.a. It lists under SVE integer arithmetic:

MUL   Multiply by immediate   MUL (immediate)
      Multiply vectors        MUL (vectors, predicated)
                              MUL (vectors, unpredicated)

but the unpredicated form is actually SVE2, even though it is listed in the SVE section.

And MUL_ZZZ was not imported. Everything works out of the box. I didn't dare to try ret, but everything else seems to work so far.

Thanks for taking a look!

@tschuett (Author) commented:

At your speed.

@tschuett (Author) commented:

My plan was to legalize more instructions, but eventually we will run into predication and/or import issues.

@tschuett (Author) commented:

Translate legal SVE formal arguments and select COPY for SVE

Argument passing is already implemented.

@tschuett (Author) commented:

Ping.

auto &IntegerArithmeticActions =
getActionDefinitionsBuilder({G_ADD, G_SUB, G_AND, G_OR, G_XOR});
if (HasSVE)
IntegerArithmeticActions.legalFor({nxv16s8, nxv8s16, nxv4s32, nxv2s64});
Collaborator:

Can you rebase now that a3010c7 is in?

Collaborator:

It should be

getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
    .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
    .legalFor(HasSVE, {nxv16s8, nxv8s16, nxv4s32, nxv2s64})
    ....

(You could make the argument that the HasSVE check is unnecessary, as scalable vectors are not supported without it and will just hit some failure elsewhere anyway. We might as well add it, though, now that we can without messing up the definitions.)
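For reference, the two-argument legalFor used above is the predicate-taking overload that the rebase request (a3010c7) refers to; its likely shape (a sketch under that assumption, not the verbatim header) is simply:

// Sketch: add the legality rule only when the subtarget predicate holds,
// so the scalable-vector types become legal only when HasSVE is true.
LegalizeRuleSet &legalFor(bool Pred, std::initializer_list<LLT> Types) {
  if (!Pred)
    return *this; // no rule added; these types fall through to later rules
  return legalFor(Types);
}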

; CHECK-NEXT: ret
entry:
%c = add <vscale x 2 x i64> %a, %b
store <vscale x 2 x i64> %c, ptr %p, align 16
Collaborator:

Can these all return directly, as opposed to storing?
Can we reuse an existing test file like llvm/test/CodeGen/AArch64/sve-int-arith.ll?

Collaborator:

Can we use one of the existing tests with a new run line?

Author:

sve-int-arith.ll has too many illegal or otherwise unsupported tests.

Collaborator:

I see, it is still crashing trying to lower the arguments, as opposed to falling back. Can you move this into AArch64/GlobalISel/sve-integer.ll for the moment then, and we can try to de-duplicate them later on.

Author:

sve-integer.ll works with ret. Long term, we should merge the two files, but our SVE support is really limited right now.

Collaborator:

Yep, this is no longer about ret - the tests look good now. This is about the duplication, and about moving them to the GlobalISel directory for the time being until more stuff works without crashing.

@tschuett tschuett force-pushed the gisel-sve-integer branch 2 times, most recently from bc6c733 to f1d0b15 Compare October 24, 2024 09:12

github-actions bot commented Oct 24, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

@tschuett (Author) commented:

Thanks for taking a look.

@tschuett tschuett force-pushed the gisel-sve-integer branch 2 times, most recently from 5150968 to 068e785 Compare October 24, 2024 12:45
Thorsten Schütt added 5 commits October 25, 2024 10:40
for SVE.

Credits: llvm#72976

LLVM ERROR: cannot select: %3:zpr(<vscale x 2 x s64>) = G_MUL %0:fpr, %1:fpr (in function: xmulnxv2i64)

;; mul
define void @xmulnxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, ptr %p) {
entry:
  %c = mul <vscale x 2 x i64> %a, %b
  store <vscale x 2 x i64> %c, ptr %p, align 16
  ret void
}

define void @mulnxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, ptr %p) {
entry:
  %c = mul <vscale x 4 x i32> %a, %b
  store <vscale x 4 x i32> %c, ptr %p, align 16
  ret void
}

define void @mulnxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, ptr %p) {
entry:
  %c = mul <vscale x 8 x i16> %a, %b
  store <vscale x 8 x i16> %c, ptr %p, align 16
  ret void
}

define void @mulnxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, ptr %p) {
entry:
  %c = mul <vscale x 16 x i8> %a, %b
  store <vscale x 16 x i8> %c, ptr %p, align 16
  ret void
}
@davemgreen (Collaborator) left a comment

Thanks. LGTM

@tschuett tschuett merged commit 7b3da7b into llvm:main Oct 27, 2024
6 of 8 checks passed
@tschuett tschuett deleted the gisel-sve-integer branch October 27, 2024 22:14
NoumanAmir657 pushed a commit to NoumanAmir657/llvm-project that referenced this pull request Nov 4, 2024
…or SVE (llvm#110561)

Credits: llvm#72976

LLVM ERROR: cannot select: %3:zpr(<vscale x 2 x s64>) = G_MUL %0:fpr, %1:fpr (in function: xmulnxv2i64)

;; mul
define void @xmulnxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, ptr %p) {
entry:
  %c = mul <vscale x 2 x i64> %a, %b
  store <vscale x 2 x i64> %c, ptr %p, align 16
  ret void
}

define void @mulnxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, ptr %p) {
entry:
  %c = mul <vscale x 4 x i32> %a, %b
  store <vscale x 4 x i32> %c, ptr %p, align 16
  ret void
}

define void @mulnxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, ptr %p) {
entry:
  %c = mul <vscale x 8 x i16> %a, %b
  store <vscale x 8 x i16> %c, ptr %p, align 16
  ret void
}

define void @mulnxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, ptr %p) {
entry:
  %c = mul <vscale x 16 x i8> %a, %b
  store <vscale x 16 x i8> %c, ptr %p, align 16
  ret void
}