Commit a61fb1a
[AArch64][GISel] Support SVE with 128-bit min-size for G_LOAD and G_STORE
This patch adds basic support for scalable vector types in load & store instructions for AArch64 with GISel. Only scalable vector types with a 128-bit minimum size are supported, e.g. <vscale x 4 x i32> and <vscale x 16 x i8>. The patch adapts some ideas from a similar, abandoned patch: llvm#72976.
1 parent 1a49810 commit a61fb1a
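For illustration only (not part of the commit), here is a minimal IR sketch of the kind of load/store this change targets, assuming an AArch64 subtarget with SVE enabled; the function name is made up for the example, and the 16-byte (128-bit) alignment matches the legality rule added in AArch64LegalizerInfo.cpp below:

; Minimal sketch, assuming an AArch64 target with +sve.
; Both operations use a scalable type with a 128-bit minimum size and
; 128-bit alignment, so with this patch GlobalISel should no longer
; need to fall back to SelectionDAG for them.
define void @copy_nxv4i32(ptr %src, ptr %dst) {
  %v = load <vscale x 4 x i32>, ptr %src, align 16
  store <vscale x 4 x i32> %v, ptr %dst, align 16
  ret void
}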

10 files changed: +221 −25 lines

llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h

Lines changed: 4 additions & 4 deletions
@@ -652,17 +652,17 @@ bool GIMatchTableExecutor::executeMatchTable(
       MachineMemOperand *MMO =
           *(State.MIs[InsnID]->memoperands_begin() + MMOIdx);

-      unsigned Size = MRI.getType(MO.getReg()).getSizeInBits();
+      const auto Size = MRI.getType(MO.getReg()).getSizeInBits();
       if (MatcherOpcode == GIM_CheckMemorySizeEqualToLLT &&
-          MMO->getSizeInBits().getValue() != Size) {
+          MMO->getSizeInBits() != Size) {
         if (handleReject() == RejectAndGiveUp)
           return false;
       } else if (MatcherOpcode == GIM_CheckMemorySizeLessThanLLT &&
-                 MMO->getSizeInBits().getValue() >= Size) {
+                 MMO->getSizeInBits().getValue() >= Size.getKnownMinValue()) {
         if (handleReject() == RejectAndGiveUp)
           return false;
       } else if (MatcherOpcode == GIM_CheckMemorySizeGreaterThanLLT &&
-                 MMO->getSizeInBits().getValue() <= Size)
+                 MMO->getSizeInBits().getValue() <= Size.getKnownMinValue())
         if (handleReject() == RejectAndGiveUp)
           return false;

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

Lines changed: 1 addition & 1 deletion
@@ -1080,7 +1080,7 @@ bool CombinerHelper::isIndexedLoadStoreLegal(GLoadStore &LdSt) const {
   LLT Ty = MRI.getType(LdSt.getReg(0));
   LLT MemTy = LdSt.getMMO().getMemoryType();
   SmallVector<LegalityQuery::MemDesc, 2> MemDescrs(
-      {{MemTy, MemTy.getSizeInBits(), AtomicOrdering::NotAtomic}});
+      {{MemTy, MemTy.getSizeInBits().getKnownMinValue(), AtomicOrdering::NotAtomic}});
   unsigned IndexedOpc = getIndexedOpc(LdSt.getOpcode());
   SmallVector<LLT> OpTys;
   if (IndexedOpc == TargetOpcode::G_INDEXED_STORE)

llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp

Lines changed: 1 addition & 1 deletion
@@ -1413,7 +1413,7 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {

 bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
   const StoreInst &SI = cast<StoreInst>(U);
-  if (DL->getTypeStoreSize(SI.getValueOperand()->getType()) == 0)
+  if (DL->getTypeStoreSize(SI.getValueOperand()->getType()).isZero())
     return true;

   ArrayRef<Register> Vals = getOrCreateVRegs(*SI.getValueOperand());

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 11 additions & 3 deletions
@@ -26375,12 +26375,20 @@ bool AArch64TargetLowering::shouldLocalize(
   return TargetLoweringBase::shouldLocalize(MI, TTI);
 }

+static bool isScalableTySupported(const unsigned Op) {
+  return Op == Instruction::Load || Op == Instruction::Store;
+}
+
 bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
-  if (Inst.getType()->isScalableTy())
-    return true;
+  const auto ScalableTySupported = isScalableTySupported(Inst.getOpcode());
+
+  // Fallback for scalable vectors
+  if (Inst.getType()->isScalableTy() && !ScalableTySupported) {
+    return true;
+  }

   for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
-    if (Inst.getOperand(i)->getType()->isScalableTy())
+    if (Inst.getOperand(i)->getType()->isScalableTy() && !ScalableTySupported)
       return true;

   if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {

llvm/lib/Target/AArch64/AArch64RegisterBanks.td

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 def GPRRegBank : RegisterBank<"GPR", [XSeqPairsClass]>;

 /// Floating Point/Vector Registers: B, H, S, D, Q.
-def FPRRegBank : RegisterBank<"FPR", [QQQQ]>;
+def FPRRegBank : RegisterBank<"FPR", [QQQQ, ZPR]>;

 /// Conditional register: NZCV.
 def CCRegBank : RegisterBank<"CC", [CCR]>;

llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp

Lines changed: 53 additions & 6 deletions
@@ -901,6 +901,27 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
   return GenericOpc;
 }

+/// Select the AArch64 opcode for the G_LOAD or G_STORE operation for scalable
+/// vectors.
+/// \p ElementSize size of the element of the scalable vector
+static unsigned selectLoadStoreSVEOp(const unsigned GenericOpc,
+                                     const unsigned ElementSize) {
+  const bool isStore = GenericOpc == TargetOpcode::G_STORE;
+
+  switch (ElementSize) {
+  case 8:
+    return isStore ? AArch64::ST1B : AArch64::LD1B;
+  case 16:
+    return isStore ? AArch64::ST1H : AArch64::LD1H;
+  case 32:
+    return isStore ? AArch64::ST1W : AArch64::LD1W;
+  case 64:
+    return isStore ? AArch64::ST1D : AArch64::LD1D;
+  }
+
+  return GenericOpc;
+}
+
 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
 /// to \p *To.
 ///
@@ -2853,8 +2874,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
       return false;
     }

-    uint64_t MemSizeInBytes = LdSt.getMemSize().getValue();
-    unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue();
+    uint64_t MemSizeInBytes = LdSt.getMemSize().getValue().getKnownMinValue();
+    unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue().getKnownMinValue();
     AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();

     // Need special instructions for atomics that affect ordering.
@@ -2906,9 +2927,23 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     const LLT ValTy = MRI.getType(ValReg);
     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);

+#ifndef NDEBUG
+    if (ValTy.isScalableVector()) {
+      assert(STI.hasSVE()
+             && "Load/Store register operand is scalable vector "
+                "while SVE is not supported by the target");
+      // assert(RB.getID() == AArch64::SVRRegBankID
+      //        && "Load/Store register operand is scalable vector "
+      //           "while its register bank is not SVR");
+    }
+#endif
+
     // The code below doesn't support truncating stores, so we need to split it
     // again.
-    if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
+    // Truncate only if type is not scalable vector
+    const bool NeedTrunc = !ValTy.isScalableVector()
+                           && ValTy.getSizeInBits().getFixedValue() > MemSizeInBits;
+    if (isa<GStore>(LdSt) && NeedTrunc) {
       unsigned SubReg;
       LLT MemTy = LdSt.getMMO().getMemoryType();
       auto *RC = getRegClassForTypeOnBank(MemTy, RB);
@@ -2921,7 +2956,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
                       .getReg(0);
       RBI.constrainGenericRegister(Copy, *RC, MRI);
       LdSt.getOperand(0).setReg(Copy);
-    } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
+    } else if (isa<GLoad>(LdSt) && NeedTrunc) {
       // If this is an any-extending load from the FPR bank, split it into a regular
       // load + extend.
       if (RB.getID() == AArch64::FPRRegBankID) {
@@ -2951,10 +2986,19 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     // instruction with an updated opcode, or a new instruction.
     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
       bool IsStore = isa<GStore>(I);
-      const unsigned NewOpc =
-          selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
+      unsigned NewOpc;
+      if (ValTy.isScalableVector()) {
+        NewOpc = selectLoadStoreSVEOp(I.getOpcode(), ValTy.getElementType().getSizeInBits());
+      } else {
+        NewOpc = selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
+      }
       if (NewOpc == I.getOpcode())
         return nullptr;
+
+      if (ValTy.isScalableVector()) {
+        // Add the predicate register operand
+        I.addOperand(MachineOperand::CreatePredicate(true));
+      }
       // Check if we can fold anything into the addressing mode.
       auto AddrModeFns =
           selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
@@ -2970,6 +3014,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
       Register CurValReg = I.getOperand(0).getReg();
      IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
       NewInst.cloneMemRefs(I);
+      if (ValTy.isScalableVector()) {
+        NewInst.add(I.getOperand(1)); // Copy predicate register
+      }
       for (auto &Fn : *AddrModeFns)
         Fn(NewInst);
       I.eraseFromParent();

llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp

Lines changed: 90 additions & 3 deletions
@@ -61,6 +61,79 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
   const LLT v2s64 = LLT::fixed_vector(2, 64);
   const LLT v2p0 = LLT::fixed_vector(2, p0);

+  // Scalable vector sizes range from 128 to 2048
+  // Note that subtargets may not support the full range.
+  // See [ScalableVecTypes] below.
+  const LLT nxv16s8 = LLT::scalable_vector(16, s8);
+  const LLT nxv32s8 = LLT::scalable_vector(32, s8);
+  const LLT nxv64s8 = LLT::scalable_vector(64, s8);
+  const LLT nxv128s8 = LLT::scalable_vector(128, s8);
+  const LLT nxv256s8 = LLT::scalable_vector(256, s8);
+
+  const LLT nxv8s16 = LLT::scalable_vector(8, s16);
+  const LLT nxv16s16 = LLT::scalable_vector(16, s16);
+  const LLT nxv32s16 = LLT::scalable_vector(32, s16);
+  const LLT nxv64s16 = LLT::scalable_vector(64, s16);
+  const LLT nxv128s16 = LLT::scalable_vector(128, s16);
+
+  const LLT nxv4s32 = LLT::scalable_vector(4, s32);
+  const LLT nxv8s32 = LLT::scalable_vector(8, s32);
+  const LLT nxv16s32 = LLT::scalable_vector(16, s32);
+  const LLT nxv32s32 = LLT::scalable_vector(32, s32);
+  const LLT nxv64s32 = LLT::scalable_vector(64, s32);
+
+  const LLT nxv2s64 = LLT::scalable_vector(2, s64);
+  const LLT nxv4s64 = LLT::scalable_vector(4, s64);
+  const LLT nxv8s64 = LLT::scalable_vector(8, s64);
+  const LLT nxv16s64 = LLT::scalable_vector(16, s64);
+  const LLT nxv32s64 = LLT::scalable_vector(32, s64);
+
+  const LLT nxv2p0 = LLT::scalable_vector(2, p0);
+  const LLT nxv4p0 = LLT::scalable_vector(4, p0);
+  const LLT nxv8p0 = LLT::scalable_vector(8, p0);
+  const LLT nxv16p0 = LLT::scalable_vector(16, p0);
+  const LLT nxv32p0 = LLT::scalable_vector(32, p0);
+
+  const auto ScalableVec128 = {
+      nxv16s8, nxv8s16, nxv4s32, nxv2s64, nxv2p0,
+  };
+  const auto ScalableVec256 = {
+      nxv32s8, nxv16s16, nxv8s32, nxv4s64, nxv4p0,
+  };
+  const auto ScalableVec512 = {
+      nxv64s8, nxv32s16, nxv16s32, nxv8s64, nxv8p0,
+  };
+  const auto ScalableVec1024 = {
+      nxv128s8, nxv64s16, nxv32s32, nxv16s64, nxv16p0,
+  };
+  const auto ScalableVec2048 = {
+      nxv256s8, nxv128s16, nxv64s32, nxv32s64, nxv32p0,
+  };
+
+  /// Scalable vector types supported by the sub target.
+  /// Empty if SVE is not supported.
+  SmallVector<LLT> ScalableVecTypes;
+
+  if (ST.hasSVE()) {
+    // Add scalable vector types that are supported by the subtarget
+    const auto MinSize = ST.getMinSVEVectorSizeInBits();
+    auto MaxSize = ST.getMaxSVEVectorSizeInBits();
+    if (MaxSize == 0) {
+      // Unknown max size, assume the target supports all sizes.
+      MaxSize = 2048;
+    }
+    if (MinSize <= 128 && 128 <= MaxSize)
+      ScalableVecTypes.append(ScalableVec128);
+    if (MinSize <= 256 && 256 <= MaxSize)
+      ScalableVecTypes.append(ScalableVec256);
+    if (MinSize <= 512 && 512 <= MaxSize)
+      ScalableVecTypes.append(ScalableVec512);
+    if (MinSize <= 1024 && 1024 <= MaxSize)
+      ScalableVecTypes.append(ScalableVec1024);
+    if (MinSize <= 2048 && 2048 <= MaxSize)
+      ScalableVecTypes.append(ScalableVec2048);
+  }
+
   std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
                                                         v16s8, v8s16, v4s32,
                                                         v2s64, v2p0,
@@ -329,6 +402,18 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
     return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0;
   };

+  const auto IsSameScalableVecTy = [=](const LegalityQuery &Query) {
+    // Legal if loading a scalable vector type
+    // into a scalable vector register of the exactly same type
+    if (!Query.Types[0].isScalableVector() || Query.Types[1] != p0)
+      return false;
+    if (Query.MMODescrs[0].MemoryTy != Query.Types[0])
+      return false;
+    if (Query.MMODescrs[0].AlignInBits < 128)
+      return false;
+    return is_contained(ScalableVecTypes, Query.Types[0]);
+  };
+
   getActionDefinitionsBuilder(G_LOAD)
       .customIf([=](const LegalityQuery &Query) {
         return HasRCPC3 && Query.Types[0] == s128 &&
@@ -354,6 +439,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       // These extends are also legal
       .legalForTypesWithMemDesc(
           {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
+      .legalIf(IsSameScalableVecTy)
       .widenScalarToNextPow2(0, /* MinSize = */ 8)
       .clampMaxNumElements(0, s8, 16)
       .clampMaxNumElements(0, s16, 8)
@@ -398,7 +484,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
            {s64, p0, s64, 8}, {s64, p0, s32, 8}, // truncstorei32 from s64
            {p0, p0, s64, 8}, {s128, p0, s128, 8}, {v16s8, p0, s128, 8},
            {v8s8, p0, s64, 8}, {v4s16, p0, s64, 8}, {v8s16, p0, s128, 8},
-           {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
+           {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8},
+      })
+      .legalIf(IsSameScalableVecTy)
       .clampScalar(0, s8, s64)
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].isScalar() &&
@@ -440,8 +528,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                                  {p0, v4s32, v4s32, 8},
                                  {p0, v2s64, v2s64, 8},
                                  {p0, v2p0, v2p0, 8},
-                                 {p0, s128, s128, 8},
-                                 })
+                                 {p0, s128, s128, 8}})
       .unsupported();

   auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) {

llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp

Lines changed: 3 additions & 3 deletions
@@ -309,7 +309,7 @@ bool matchSplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI) {
   if (!Store.isSimple())
     return false;
   LLT ValTy = MRI.getType(Store.getValueReg());
-  if (!ValTy.isVector() || ValTy.getSizeInBits() != 128)
+  if (!ValTy.isVector() || ValTy.getSizeInBits().getKnownMinValue() != 128)
     return false;
   if (Store.getMemSizeInBits() != ValTy.getSizeInBits())
     return false; // Don't split truncating stores.
@@ -657,8 +657,8 @@ bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
       Register PtrBaseReg;
       APInt Offset;
       LLT StoredValTy = MRI.getType(St->getValueReg());
-      unsigned ValSize = StoredValTy.getSizeInBits();
-      if (ValSize < 32 || St->getMMO().getSizeInBits() != ValSize)
+      const auto ValSize = StoredValTy.getSizeInBits();
+      if (ValSize.getKnownMinValue() < 32 || St->getMMO().getSizeInBits() != ValSize)
        continue;

       Register PtrReg = St->getPointerReg();

llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp

Lines changed: 7 additions & 3 deletions
@@ -257,6 +257,7 @@ AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
   case AArch64::QQRegClassID:
   case AArch64::QQQRegClassID:
   case AArch64::QQQQRegClassID:
+  case AArch64::ZPRRegClassID:
     return getRegBank(AArch64::FPRRegBankID);
   case AArch64::GPR32commonRegClassID:
   case AArch64::GPR32RegClassID:
@@ -740,11 +741,14 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     LLT Ty = MRI.getType(MO.getReg());
     if (!Ty.isValid())
       continue;
-    OpSize[Idx] = Ty.getSizeInBits();
+    OpSize[Idx] = Ty.getSizeInBits().getKnownMinValue();

-    // As a top-level guess, vectors go in FPRs, scalars and pointers in GPRs.
+    // As a top-level guess, scalable vectors go in SVRs, non-scalable
+    // vectors go in FPRs, scalars and pointers in GPRs.
     // For floating-point instructions, scalars go in FPRs.
-    if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc) ||
+    if (Ty.isScalableVector())
+      OpRegBankIdx[Idx] = PMI_FirstFPR;
+    else if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc) ||
         Ty.getSizeInBits() > 64)
       OpRegBankIdx[Idx] = PMI_FirstFPR;
     else
