[SLP]Fix the analysis for masked compress loads

alexey-bataev · web-flow · commit 3918ef3688dc · 2025-05-20T07:31:16.000-04:00
Need to remove the check for Orders in interleaved loads analysis and estimate shuffle cost without the reordering to correctly handle the costs of masked compress loads. Reviewers: hiraditya, HanKuanChen, RKSimon Reviewed By: HanKuanChen, RKSimon Pull Request: #140647
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5944,10 +5944,9 @@ static bool isMaskedLoadCompress(
     // Check for potential segmented(interleaved) loads.
     VectorType *AlignedLoadVecTy = getWidenedType(
         ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
-    if (!isSafeToLoadUnconditionally(
-            Ptr0, AlignedLoadVecTy, CommonAlignment, DL,
-            cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC,
-            &DT, &TLI))
+    if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
+                                     DL, cast<LoadInst>(VL.back()), &AC, &DT,
+                                     &TLI))
       AlignedLoadVecTy = LoadVecTy;
     if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
                                          CommonAlignment,
@@ -5957,25 +5956,22 @@ static bool isMaskedLoadCompress(
                               Instruction::Load, AlignedLoadVecTy,
                               CompressMask[1], std::nullopt, CommonAlignment,
                               LI->getPointerAddressSpace(), CostKind, IsMasked);
-      if (!Mask.empty())
-        InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
-                                            VecTy, Mask, CostKind);
       if (InterleavedCost < GatherCost) {
         InterleaveFactor = CompressMask[1];
         LoadVecTy = AlignedLoadVecTy;
         return true;
       }
     }
   }
+  InstructionCost CompressCost = ::getShuffleCost(
+      TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
   if (!Order.empty()) {
     SmallVector<int> NewMask(Sz, PoisonMaskElem);
     for (unsigned I : seq<unsigned>(Sz)) {
       NewMask[I] = CompressMask[Mask[I]];
     }
     CompressMask.swap(NewMask);
   }
-  InstructionCost CompressCost = ::getShuffleCost(
-      TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
   InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
   return TotalVecCost < GatherCost;
 }
@@ -13553,10 +13549,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         SmallVector<Value *> PointerOps(Scalars.size());
         for (auto [I, V] : enumerate(Scalars))
           PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
-        (void)isMaskedLoadCompress(
+        [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
             Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
             *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
             CompressMask, LoadVecTy);
+        assert(IsVectorized && "Failed to vectorize load");
         CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                         InterleaveFactor, IsMasked);
         Align CommonAlignment = LI0->getAlign();
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
@@ -15,10 +15,11 @@ define i16 @test() {
 ; CHECK-NEXT:    [[PEDGE_061_I:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ null, [[ENTRY]] ]
 ; CHECK-NEXT:    [[INCDEC_PTR_I]] = getelementptr [[S]], ptr [[PEDGE_061_I]], i64 -1
 ; CHECK-NEXT:    [[PPREV_0_I]] = getelementptr [[S]], ptr [[PPREV_062_I]], i64 -1
-; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr align 2 [[PPREV_0_I]], i64 4, <2 x i1> splat (i1 true), i32 2)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1
-; CHECK-NEXT:    [[CMP_I178:%.*]] = icmp ult i16 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <3 x i16> @llvm.masked.load.v3i16.p0(ptr [[PPREV_0_I]], i32 2, <3 x i1> <i1 true, i1 false, i1 true>, <3 x i16> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <3 x i16> [[TMP1]], <3 x i16> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
+; CHECK-NEXT:    [[CMP_I178:%.*]] = icmp ult i16 [[TMP4]], [[TMP3]]
 ; CHECK-NEXT:    br label [[WHILE_BODY_I]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll
@@ -9,18 +9,20 @@ define void @test(ptr %mdct_forward_x) {
 ; CHECK:       [[FOR_COND]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[MDCT_FORWARD_X]], align 8
 ; CHECK-NEXT:    [[ARRAYIDX2_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 32
+; CHECK-NEXT:    [[ARRAYIDX5_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40
 ; CHECK-NEXT:    [[ADD_PTR_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 24
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x ptr> [[TMP1]], <4 x ptr> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, <4 x ptr> [[TMP2]], <4 x i64> <i64 28, i64 36, i64 24, i64 28>
-; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[ARRAYIDX2_I_I]], i64 -8, <2 x i1> splat (i1 true), i32 2)
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x ptr> [[TMP2]], <4 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> <i64 48, i64 40>
-; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP6]], i32 4, <2 x i1> splat (i1 true), <2 x float> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr [[ADD_PTR_I]], i32 4, <3 x i1> <i1 true, i1 false, i1 true>, <3 x float> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr [[ARRAYIDX5_I_I]], i32 4, <3 x i1> <i1 true, i1 false, i1 true>, <3 x float> poison)
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <2 x i32> <i32 2, i32 0>
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true), <4 x float> poison)
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 0>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> <float poison, float poison, float 0.000000e+00, float poison>, <4 x float> [[TMP10]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 4>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <4 x i32> <i32 2, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> <float poison, float poison, float 0.000000e+00, float poison>, <4 x float> [[TMP22]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
 ; CHECK-NEXT:    [[TMP12:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP4]], i64 0)
 ; CHECK-NEXT:    [[TMP13:%.*]] = fsub <4 x float> [[TMP9]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = fadd <4 x float> [[TMP9]], [[TMP12]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll
@@ -9,17 +9,16 @@ define void @test() {
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[M1:%.*]] = alloca [[STRUCT_AE:%.*]], align 8
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[M1]], i64 8
+; CHECK-NEXT:    [[ARRAYIDX_I5_I:%.*]] = getelementptr i8, ptr [[M1]], i64 48
 ; CHECK-NEXT:    [[ARRAYIDX_I4:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT:    [[ARRAYIDX_I5_I:%.*]] = getelementptr i8, ptr [[M1]], i64 40
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x double>, ptr [[M1]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <5 x double> [[TMP1]], <5 x double> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP4:%.*]] = load <6 x double>, ptr [[M1]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <6 x double> [[TMP4]], <6 x double> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX_I5_I]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <7 x double>, ptr [[TMP0]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <7 x double> [[TMP4]], <7 x double> poison, <4 x i32> <i32 5, i32 0, i32 3, i32 6>
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <5 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <5 x double> [[TMP1]], <5 x double> [[TMP7]], <4 x i32> <i32 0, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <5 x double> [[TMP7]], <5 x double> [[TMP1]], <4 x i32> <i32 0, i32 6, i32 9, i32 1>
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x double> [[TMP8]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = fptosi <4 x double> [[TMP9]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP11:%.*]] = sitofp <4 x i32> [[TMP10]] to <4 x double>