Skip to content

Commit af3295b

Browse files
[SLP]Enable splat ordering for loads
Enables splat support for loads with lanes > 2 or number of operands > 2. Allows better detection of splats of loads and reduces the number of shuffles in some cases. X86, AVX512, -O3+LTO. Metric: size..text — results, results0, diff: test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test: 154867.00, 156723.00, 1.2%; test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test: 12467735.00, 12468023.00, 0.0%. Better vectorization quality. Reviewers: RKSimon. Reviewed By: RKSimon. Pull Request: #115173
1 parent 43570a2 commit af3295b

File tree

2 files changed

+17
-20
lines changed

2 files changed

+17
-20
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2386,6 +2386,9 @@ class BoUpSLP {
23862386
/// the whole vector (it is mixed with constants or loop invariant values).
23872387
/// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
23882388
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2389+
// Small number of loads - try load matching.
2390+
if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
2391+
return false;
23892392
bool OpAPO = getData(OpIdx, Lane).APO;
23902393
bool IsInvariant = L && L->isLoopInvariant(Op);
23912394
unsigned Cnt = 0;
@@ -2511,23 +2514,23 @@ class BoUpSLP {
25112514
Value *OpLane0 = getValue(OpIdx, FirstLane);
25122515
// Keep track if we have instructions with all the same opcode on one
25132516
// side.
2514-
if (isa<LoadInst>(OpLane0))
2515-
ReorderingModes[OpIdx] = ReorderingMode::Load;
2516-
else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2517+
if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
25172518
// Check if OpLane0 should be broadcast.
25182519
if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
25192520
!canBeVectorized(OpILane0, OpIdx, FirstLane))
25202521
ReorderingModes[OpIdx] = ReorderingMode::Splat;
2522+
else if (isa<LoadInst>(OpILane0))
2523+
ReorderingModes[OpIdx] = ReorderingMode::Load;
25212524
else
25222525
ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2523-
} else if (isa<Constant>(OpLane0))
2526+
} else if (isa<Constant>(OpLane0)) {
25242527
ReorderingModes[OpIdx] = ReorderingMode::Constant;
2525-
else if (isa<Argument>(OpLane0))
2528+
} else if (isa<Argument>(OpLane0)) {
25262529
// Our best hope is a Splat. It may save some cost in some cases.
25272530
ReorderingModes[OpIdx] = ReorderingMode::Splat;
2528-
else
2529-
// NOTE: This should be unreachable.
2530-
ReorderingModes[OpIdx] = ReorderingMode::Failed;
2531+
} else {
2532+
llvm_unreachable("Unexpected value kind.");
2533+
}
25312534
}
25322535

25332536
// Check that we don't have same operands. No need to reorder if operands

llvm/test/Transforms/SLPVectorizer/RISCV/loads-ordering.ll

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,19 +8,13 @@ define fastcc void @rephase(ptr %phases_in, ptr %157, i64 %158) {
88
; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 [[TMP1]]
99
; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[TMP0]], align 8
1010
; CHECK-NEXT: [[IMAG_247:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 408
11-
; CHECK-NEXT: [[MUL35_248:%.*]] = fmul double [[TMP2]], 0.000000e+00
12-
; CHECK-NEXT: store double [[MUL35_248]], ptr [[IMAG_247]], align 8
13-
; CHECK-NEXT: [[ARRAYIDX23_1_249:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 416
14-
; CHECK-NEXT: [[MUL_1_250:%.*]] = fmul double [[TMP2]], 0.000000e+00
15-
; CHECK-NEXT: store double [[MUL_1_250]], ptr [[ARRAYIDX23_1_249]], align 8
1611
; CHECK-NEXT: [[IMAG_1_251:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 424
17-
; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[IMAG_1_251]], align 8
18-
; CHECK-NEXT: [[MUL35_1_252:%.*]] = fmul double [[TMP2]], [[TMP3]]
19-
; CHECK-NEXT: store double [[MUL35_1_252]], ptr [[IMAG_1_251]], align 8
20-
; CHECK-NEXT: [[ARRAYIDX23_2_253:%.*]] = getelementptr i8, ptr [[IND_END11]], i64 432
21-
; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX23_2_253]], align 8
22-
; CHECK-NEXT: [[MUL_2_254:%.*]] = fmul double [[TMP2]], [[TMP4]]
23-
; CHECK-NEXT: store double [[MUL_2_254]], ptr [[ARRAYIDX23_2_253]], align 8
12+
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[IMAG_1_251]], align 8
13+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0
14+
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32> zeroinitializer
15+
; CHECK-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, <2 x double> [[TMP3]], i64 2)
16+
; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP5]], [[TMP6]]
17+
; CHECK-NEXT: store <4 x double> [[TMP7]], ptr [[IMAG_247]], align 8
2418
; CHECK-NEXT: store double [[TMP2]], ptr [[PHASES_IN]], align 8
2519
; CHECK-NEXT: ret void
2620
;

0 commit comments

Comments
 (0)