
Commit 320097d

[AArch64] Sink vscale calls into loops for better isel
For more recent SVE-capable CPUs it is beneficial to use the inc* instructions to increment a value by vscale (potentially shifted or multiplied), even in short loops. This patch tells CodeGenPrepare to sink appropriate vscale calls into the blocks where they are used, so that isel can match them.
1 parent 5c3beb7 commit 320097d
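
To illustrate the effect, here is a minimal, self-contained IR sketch; the function and value names are invented for this illustration and are not taken from the commit (the in-tree tests below exercise the real patterns). The loop step 4 x vscale is loop-invariant, so the IR naturally computes it once in the entry block, out of reach of instruction selection when it lowers the in-loop add:

target triple = "aarch64-unknown-linux-gnu"

declare i64 @llvm.vscale.i64()

define i64 @count_up(i64 %n) #0 {
entry:
  %vs = tail call i64 @llvm.vscale.i64()
  %step = shl nuw nsw i64 %vs, 2   ; 4 x vscale = element count of one <vscale x 4 x i32>
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; With this patch, CodeGenPrepare sinks the vscale call and the shift down
  ; here next to their user, so isel sees add-of-(shl vscale, 2) in a single
  ; block and can select one incw for the induction update.
  %iv.next = add nuw i64 %iv, %step
  %done = icmp uge i64 %iv.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret i64 %iv.next
}

attributes #0 = { "target-features"="+sve2" }

Running such input through opt -codegenprepare and then llc is what the two new tests below check, at the IR and assembly level respectively.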

File tree

3 files changed, +245 -3 lines changed


llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 37 additions & 3 deletions
@@ -14520,6 +14520,19 @@ static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
   return true;
 }
 
+/// We want to sink following cases:
+/// (add|sub) A, ((mul|shl) vscale, imm); (add|sub) A, vscale
+static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
+  if (match(Op, m_VScale()))
+    return true;
+  if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
+      match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
+    Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
+    return true;
+  }
+  return false;
+}
+
 /// Check if sinking \p I's operands to I's basic block is profitable, because
 /// the operands can be folded into a target instruction, e.g.
 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -14636,12 +14649,29 @@ bool AArch64TargetLowering::shouldSinkOperands(
     }
   }
 
-  if (!I->getType()->isVectorTy())
-    return false;
-
   switch (I->getOpcode()) {
   case Instruction::Sub:
   case Instruction::Add: {
+    // If the subtarget wants to make use of sve inc* instructions, then sink
+    // vscale intrinsic (along with any shifts or multiplies) so that the
+    // appropriate folds can be made.
+    if (Subtarget->useScalarIncVL()) {
+      bool Sink = false;
+      if (shouldSinkVScale(I->getOperand(0), Ops)) {
+        Ops.push_back(&I->getOperandUse(0));
+        Sink = true;
+      }
+
+      if (shouldSinkVScale(I->getOperand(1), Ops)) {
+        Ops.push_back(&I->getOperandUse(1));
+        Sink = true;
+      }
+
+      if (Sink)
+        return true;
+    }
+    if (!I->getType()->isVectorTy())
+      return false;
     if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
       return false;
 
@@ -14660,6 +14690,8 @@ bool AArch64TargetLowering::shouldSinkOperands(
     return true;
   }
   case Instruction::Or: {
+    if (!I->getType()->isVectorTy())
+      return false;
     // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
     // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
     if (Subtarget->hasNEON()) {
@@ -14697,6 +14729,8 @@ bool AArch64TargetLowering::shouldSinkOperands(
     return false;
   }
   case Instruction::Mul: {
+    if (!I->getType()->isVectorTy())
+      return false;
     int NumZExts = 0, NumSExts = 0;
     for (auto &Op : I->operands()) {
       // Make sure we are not already sinking this operand
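
The new hook only fires for the shapes spelled out in shouldSinkVScale's comment, and only when the subtarget reports useScalarIncVL(). A small illustrative IR fragment (names invented for this sketch, not taken from the commit) showing which operand forms are recognised; the vscale computations sit in a separate block, since sinking is only interesting for values defined outside the user's block:

define void @shapes(i64 %x, i64 %y) "target-features"="+sve2" {
entry:
  %vs = call i64 @llvm.vscale.i64()
  %byshl = shl i64 %vs, 2
  %bymul = mul i64 %vs, 12
  %byvar = mul i64 %vs, %y
  br label %body

body:
  %a = add i64 %x, %vs      ; matched: (add A, vscale)
  %b = add i64 %x, %byshl   ; matched: (add A, (shl vscale, imm)); the shl and its vscale operand are both queued
  %c = sub i64 %x, %bymul   ; matched: (sub A, (mul vscale, imm))
  %d = add i64 %x, %byvar   ; not matched: the multiplier is not a constant
  ret void
}

declare i64 @llvm.vscale.i64()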
Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: inc_add:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    mov w9, w1
+; CHECK-NEXT:  .LBB0_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x2, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x3, x8, lsl #2]
+; CHECK-NEXT:    fmul z0.s, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x4, x8, lsl #2]
+; CHECK-NEXT:    incw x8
+; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %wide.trip.count = zext i32 %N to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %2 = getelementptr inbounds float, ptr %in1, i64 %index
+  %wide.load = load <vscale x 4 x float>, ptr %2, align 4
+  %3 = getelementptr inbounds float, ptr %in2, i64 %index
+  %wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %5 = getelementptr inbounds float, ptr %out, i64 %index
+  store <vscale x 4 x float> %4, ptr %5, align 4
+  %index.next = add nuw i64 %index, %1
+  %6 = icmp eq i64 %index.next, %wide.trip.count
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: dec_sub:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    rdvl x9, #-1
+; CHECK-NEXT:    mov w8, w1
+; CHECK-NEXT:    add x11, x9, #4
+; CHECK-NEXT:    add x9, x2, x11
+; CHECK-NEXT:    add x10, x3, x11
+; CHECK-NEXT:    add x11, x4, x11
+; CHECK-NEXT:  .LBB1_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x10, x8, lsl #2]
+; CHECK-NEXT:    fmul z0.s, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x11, x8, lsl #2]
+; CHECK-NEXT:    decw x8
+; CHECK-NEXT:    cbnz x8, .LBB1_1
+; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %0 = zext i32 %N to i64
+  %1 = tail call i64 @llvm.vscale.i64()
+  %2 = shl nuw nsw i64 %1, 2
+  %3 = sub nsw i64 1, %2
+  %invariant.gep = getelementptr float, ptr %in1, i64 %3
+  %invariant.gep20 = getelementptr float, ptr %in2, i64 %3
+  %invariant.gep22 = getelementptr float, ptr %out, i64 %3
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %offset.idx = sub i64 %0, %index
+  %gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
+  %wide.load = load <vscale x 4 x float>, ptr %gep, align 4
+  %gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
+  %wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
+  store <vscale x 4 x float> %4, ptr %gep23, align 4
+  %index.next = add nuw i64 %index, %2
+  %5 = icmp eq i64 %index.next, %0
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
+
+attributes #0 = { "target-features"="+sve2" }
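
Both tests step by 4 x vscale (the element count of a <vscale x 4 x float> register), which is what maps onto the incw/decw checked above. Since an SVE register is 128 x vscale bits wide, other constant multiples of vscale correspond to the other element-count increments (2 x vscale to incd, 8 x vscale to inch, 16 x vscale to incb). A hedged sketch, not part of the committed tests, of a step the matcher also recognises and that would be expected to select incd once the vscale call is sunk:

target triple = "aarch64-unknown-linux-gnu"

declare i64 @llvm.vscale.i64()

define i64 @step_by_two(i64 %base) #0 {
entry:
  %vs = tail call i64 @llvm.vscale.i64()
  %step = shl nuw nsw i64 %vs, 1   ; 2 x vscale = element count of one <vscale x 2 x i64>
  br label %use

use:
  ; After sinking, isel sees the add and the shifted vscale together and can
  ; fold them into a single incd of %base (hedged expectation, not checked in-tree).
  %next = add nuw i64 %base, %step
  ret i64 %next
}

attributes #0 = { "target-features"="+sve2" }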
Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -codegenprepare -S -o - %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @inc_add
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[IN2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[INDEX]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP2]], ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %wide.trip.count = zext i32 %N to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %2 = getelementptr inbounds float, ptr %in1, i64 %index
+  %wide.load = load <vscale x 4 x float>, ptr %2, align 4
+  %3 = getelementptr inbounds float, ptr %in2, i64 %index
+  %wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %5 = getelementptr inbounds float, ptr %out, i64 %index
+  store <vscale x 4 x float> %4, ptr %5, align 4
+  %index.next = add nuw i64 %index, %1
+  %6 = icmp eq i64 %index.next, %wide.trip.count
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @dec_sub
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw i64 1, [[TMP2]]
+; CHECK-NEXT:    [[INVARIANT_GEP:%.*]] = getelementptr float, ptr [[IN1]], i64 [[TMP3]]
+; CHECK-NEXT:    [[INVARIANT_GEP20:%.*]] = getelementptr float, ptr [[IN2]], i64 [[TMP3]]
+; CHECK-NEXT:    [[INVARIANT_GEP22:%.*]] = getelementptr float, ptr [[OUT]], i64 [[TMP3]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[TMP0]], [[INDEX]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[GEP21:%.*]] = getelementptr float, ptr [[INVARIANT_GEP20]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[GEP21]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT:    [[GEP23:%.*]] = getelementptr float, ptr [[INVARIANT_GEP22]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP4]], ptr [[GEP23]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = zext i32 %N to i64
+  %1 = tail call i64 @llvm.vscale.i64()
+  %2 = shl nuw nsw i64 %1, 2
+  %3 = sub nsw i64 1, %2
+  %invariant.gep = getelementptr float, ptr %in1, i64 %3
+  %invariant.gep20 = getelementptr float, ptr %in2, i64 %3
+  %invariant.gep22 = getelementptr float, ptr %out, i64 %3
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %offset.idx = sub i64 %0, %index
+  %gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
+  %wide.load = load <vscale x 4 x float>, ptr %gep, align 4
+  %gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
+  %wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
+  store <vscale x 4 x float> %4, ptr %gep23, align 4
+  %index.next = add nuw i64 %index, %2
+  %5 = icmp eq i64 %index.next, %0
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
+
+attributes #0 = { "target-features"="+sve2" }
