[SLP] Don't try to vectorize allocas

preames · preames · commit 689babdf687f · 2022-03-02T10:08:43.000-08:00
While a collection of allocas are technically vectorizeable - by forming a wider alloca - this was not a transform SLP actually knows how to do.  Instead, we were forming a bundle with missing dependencies, and then relying on the scheduling code to preserve program order if multiple instructions were scheduleable at once.  I haven't been able to write a test case, but I'm 99% sure this was wrong in some edge case.

The unknown op case was flowing down the shufflevector path.  This did result in some splat handling being lost with this change, but the same lack of splat handling is visible in a whole bunch of simple examples for the gather path.  I didn't consider this interesting to fix given how narrow the splat of allocas case is.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3910,6 +3910,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     return;
   }
 
+  // Avoid attempting to schedule allocas; there are unmodeled dependencies
+  // for "static" alloca status and for reordering with stacksave calls.
+  for (Value *V : VL) {
+    if (isa<AllocaInst>(V)) {
+      LLVM_DEBUG(dbgs() << "SLP: Gathering due to alloca.\n");
+      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+      return;
+    }
+  }
+
   if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
     if (SI->getValueOperand()->getType()->isVectorTy()) {
       LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/store_alloca.ll b/llvm/test/Transforms/SLPVectorizer/X86/store_alloca.ll
@@ -32,13 +32,13 @@ define void @ham() #1 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8** [[VAR32]] to <4 x i8*>*
 ; CHECK-NEXT:    store <4 x i8*> [[SHUFFLE]], <4 x i8*>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[VAR36:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 4
+; CHECK-NEXT:    store i8* [[VAR4]], i8** [[VAR36]], align 4
 ; CHECK-NEXT:    [[VAR37:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 5
+; CHECK-NEXT:    store i8* [[VAR5]], i8** [[VAR37]], align 4
 ; CHECK-NEXT:    [[VAR38:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 6
+; CHECK-NEXT:    store i8* [[VAR5]], i8** [[VAR38]], align 4
 ; CHECK-NEXT:    [[VAR39:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 7
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8*> [[TMP1]], i8* [[VAR5]], i32 1
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i8*> [[TMP3]], <4 x i8*> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8** [[VAR36]] to <4 x i8*>*
-; CHECK-NEXT:    store <4 x i8*> [[SHUFFLE1]], <4 x i8*>* [[TMP4]], align 4
+; CHECK-NEXT:    store i8* [[VAR5]], i8** [[VAR39]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %var2 = alloca i8
@@ -78,14 +78,13 @@ define void @spam() #1 {
 ; CHECK-NEXT:    [[VAR5:%.*]] = alloca i8, align 1
 ; CHECK-NEXT:    [[VAR12:%.*]] = alloca [12 x i8*], align 4
 ; CHECK-NEXT:    [[VAR36:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 4
+; CHECK-NEXT:    store i8* [[VAR4]], i8** [[VAR36]], align 4
 ; CHECK-NEXT:    [[VAR37:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 5
+; CHECK-NEXT:    store i8* [[VAR5]], i8** [[VAR37]], align 4
 ; CHECK-NEXT:    [[VAR38:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 6
+; CHECK-NEXT:    store i8* [[VAR5]], i8** [[VAR38]], align 4
 ; CHECK-NEXT:    [[VAR39:%.*]] = getelementptr inbounds [12 x i8*], [12 x i8*]* [[VAR12]], i32 0, i32 7
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8*> poison, i8* [[VAR4]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8*> [[TMP1]], i8* [[VAR5]], i32 1
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i8*> [[TMP2]], <4 x i8*> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8** [[VAR36]] to <4 x i8*>*
-; CHECK-NEXT:    store <4 x i8*> [[SHUFFLE]], <4 x i8*>* [[TMP3]], align 4
+; CHECK-NEXT:    store i8* [[VAR5]], i8** [[VAR39]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %var4 = alloca i8