[ARM][SLP] Fix incorrect cost function for SLP Vectorization of ZExt/SExt

nasherm · nasherm · commit bb3c2fc39765 · 2025-03-05T17:06:17.000Z
PR #117350 changed the SLP vectorizer in a way that introduced a regression on ARM vectorization benchmarks. The regression arose because those changes assume that vector SExt/ZExt instructions have a constant cost. That assumption holds for RISC-V but not for ARM, where the source and destination types of a SExt/ZExt instruction are taken into account when calculating its vector cost. Change-Id: I6f995dcde26e5aaf62b779b63e52988fb333f941
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1794,7 +1794,6 @@ InstructionCost ARMTTIImpl::getExtendedReductionCost(
   case ISD::ADD:
     if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
       std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
-
       // The legal cases are:
       //   VADDV u/s 8/16/32
       //   VADDLV u/s 32
diff --git a/llvm/test/Transforms/SLPVectorizer/ARM/vadd-mve.ll b/llvm/test/Transforms/SLPVectorizer/ARM/vadd-mve.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=slp-vectorizer --mtriple arm-none-eabi -mattr=+mve -S -o - | FileCheck %s
+
+define i64 @vadd_32_64(ptr readonly %a) {
+; CHECK-LABEL: define i64 @vadd_32_64(
+; CHECK-SAME: ptr readonly [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <4 x i32> [[TMP0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+entry: ; scalar input: four i32 loads from %a, each sign-extended to i64 and summed
+  %0 = load i32, ptr %a, align 4 ; element at byte offset 0
+  %conv = sext i32 %0 to i64 ; widen to i64 before accumulating
+  %arrayidx1 = getelementptr inbounds nuw i8, ptr %a, i32 4 ; i8-typed GEP, so the index is a byte offset
+  %1 = load i32, ptr %arrayidx1, align 4 ; element at byte offset 4
+  %conv2 = sext i32 %1 to i64
+  %add = add nsw i64 %conv2, %conv ; partial sum of elements 0 and 1
+  %arrayidx3 = getelementptr inbounds nuw i8, ptr %a, i32 8
+  %2 = load i32, ptr %arrayidx3, align 4 ; element at byte offset 8
+  %conv4 = sext i32 %2 to i64
+  %add5 = add nsw i64 %add, %conv4 ; partial sum of elements 0..2
+  %arrayidx6 = getelementptr inbounds nuw i8, ptr %a, i32 12
+  %3 = load i32, ptr %arrayidx6, align 4 ; element at byte offset 12
+  %conv7 = sext i32 %3 to i64
+  %add8 = add nsw i64 %add5, %conv7 ; sum of all four sign-extended elements
+  ret i64 %add8 ; expected vectorized form: <4 x i32> load, vector sext, reduce.add
+}