llvm
diff --git a/‎llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Lines changed: 360 additions & 86 deletions b/‎llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Lines changed: 360 additions & 86 deletions
diff --git a/‎llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
Lines changed: 8 additions & 8 deletions b/‎llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
Lines changed: 8 additions & 8 deletions
diff --git a/‎llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll
Lines changed: 4 additions & 3 deletions b/‎llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll
Lines changed: 4 additions & 3 deletions
diff --git a/‎llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
Lines changed: 11 additions & 11 deletions b/‎llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
Lines changed: 11 additions & 11 deletions
diff --git a/‎llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
Lines changed: 17 additions & 17 deletions b/‎llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
Lines changed: 17 additions & 17 deletions
diff --git a/‎llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
Lines changed: 8 additions & 8 deletions b/‎llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
Lines changed: 8 additions & 8 deletions
@@ -76,10 +76,10 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
 ; CHECK-NEXT:    [[PB_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PB]], [[ENTRY]] ], [ [[SCEVGEP311]], [[WHILE_END_LOOPEXIT]] ]
 ; CHECK-NEXT:    [[PA_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PA]], [[ENTRY]] ], [ [[SCEVGEP]], [[WHILE_END_LOOPEXIT]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP16:%.*]] = add <4 x i64> [[TMP12]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = trunc <4 x i64> [[TMP16]] to <4 x i32>
@@ -107,12 +107,12 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
 ; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq <2 x i32> [[TMP23]], zeroinitializer
 ; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne <2 x i32> [[TMP23]], zeroinitializer
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> [[TMP25]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> [[TMP25]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <2 x i32> poison, i32 [[AND95]], i32 0
 ; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP29:%.*]] = icmp ne <2 x i32> [[TMP28]], zeroinitializer
 ; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq <2 x i32> [[TMP28]], zeroinitializer
-; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <2 x i1> [[TMP29]], <2 x i1> [[TMP30]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <2 x i1> [[TMP29]], <2 x i1> [[TMP30]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
 ; CHECK-NEXT:    [[TMP32:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP31]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP33:%.*]] = zext <4 x i1> [[TMP32]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP34]] = add <4 x i32> [[TMP21]], [[TMP33]]
@@ -152,12 +152,12 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
 ; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <2 x i32> [[TMP39]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp eq <2 x i32> [[TMP40]], zeroinitializer
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <2 x i32> [[TMP40]], zeroinitializer
-; CHECK-NEXT:    [[TMP43:%.*]] = shufflevector <2 x i1> [[TMP41]], <2 x i1> [[TMP42]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
+; CHECK-NEXT:    [[TMP43:%.*]] = shufflevector <2 x i1> [[TMP41]], <2 x i1> [[TMP42]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <2 x i32> poison, i32 [[AND134]], i32 0
 ; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <2 x i32> [[TMP45]], zeroinitializer
 ; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq <2 x i32> [[TMP45]], zeroinitializer
-; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <2 x i1> [[TMP46]], <2 x i1> [[TMP47]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
+; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <2 x i1> [[TMP46]], <2 x i1> [[TMP47]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
 ; CHECK-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP43]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP50:%.*]] = zext <4 x i1> [[TMP49]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP51]] = add <4 x i32> [[TMP38]], [[TMP50]]
@@ -166,9 +166,9 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
 ; CHECK-NEXT:    br i1 [[CMP130_NOT]], label [[WHILE_END166]], label [[WHILE_BODY132]]
 ; CHECK:       while.end166:
 ; CHECK-NEXT:    [[TMP52:%.*]] = phi <4 x i32> [ [[TMP35]], [[WHILE_END122]] ], [ [[TMP51]], [[WHILE_BODY132]] ]
-; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i32> [[TMP52]], i32 2
+; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i32> [[TMP52]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP53]], ptr [[CTT:%.*]], align 4
-; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <4 x i32> [[TMP52]], i32 3
+; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <4 x i32> [[TMP52]], i32 2
 ; CHECK-NEXT:    store i32 [[TMP54]], ptr [[CFF:%.*]], align 4
 ; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <4 x i32> [[TMP52]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP55]], ptr [[CTF:%.*]], align 4
 
@@ -6,7 +6,7 @@ define i32 @foo(i32 %v1, double %v2) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 undef>, i32 [[V1:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <2 x i32> [[TMP0]] to <2 x double>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
 ; CHECK-NEXT:    br label [[FOR_COND15_PREHEADER:%.*]]
 ; CHECK:       for.cond15.preheader:
 ; CHECK-NEXT:    br label [[IF_END:%.*]]
@@ -26,14 +26,15 @@ define i32 @foo(i32 %v1, double %v2) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[ARRAYIDX43]], align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP7]])
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP8]])
 ; CHECK-NEXT:    br label [[SW_EPILOG:%.*]]
 ; CHECK:       sw.bb195:
 ; CHECK-NEXT:    br label [[SW_EPILOG]]
 ; CHECK:       do.body:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       sw.epilog:
-; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x double> [ undef, [[SW_BB195]] ], [ [[TMP8]], [[SW_BB]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = phi <4 x double> [ undef, [[SW_BB195]] ], [ [[TMP9]], [[SW_BB]] ]
 ; CHECK-NEXT:    ret i32 undef
 ; CHECK:       if.end.1:
 ; CHECK-NEXT:    br label [[FOR_COND15_1:%.*]]
 
@@ -20,17 +20,17 @@ define void @s116_modified(ptr %a) {
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
 ; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 3
 ; CHECK-NEXT:    [[LD0:%.*]] = load float, ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[GEP3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 5, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 poison, i32 2, i32 4>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <4 x float> [[TMP9]], [[TMP11]]
-; CHECK-NEXT:    store <4 x float> [[TMP12]], ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 poison, i32 2, i32 4>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast <4 x float> [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    store <4 x float> [[TMP10]], ptr [[A]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep1 = getelementptr inbounds float, ptr %a, i64 1
 
@@ -143,16 +143,17 @@ define void @gather_2(ptr %mat1, float %0, float %1) {
 ; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> zeroinitializer)
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul float [[TMP2]], 0.000000e+00
+; CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float [[TMP0]], float 0.000000e+00)
+; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul float [[TMP2]], 0.000000e+00
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul float [[TMP3]], 0.000000e+00
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul float [[TMP4]], 0.000000e+00
 ; CHECK-NEXT:    [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1
 ; CHECK-NEXT:    [[ARRAYIDX2_I_I_I278:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 1
-; CHECK-NEXT:    store float [[TMP7]], ptr [[ARRAYIDX163]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP6]], zeroinitializer
-; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr [[ARRAYIDX2_I_I_I278]], align 4
+; CHECK-NEXT:    [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2
+; CHECK-NEXT:    store float [[TMP5]], ptr [[ARRAYIDX163]], align 4
+; CHECK-NEXT:    store float [[TMP6]], ptr [[ARRAYIDX2_I_I_I278]], align 4
+; CHECK-NEXT:    store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -183,19 +184,18 @@ define i32 @reorder_indices_1(float %0) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = fneg float [[TMP3]]
 ; CHECK-NEXT:    [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]])
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = fneg <2 x float> [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]])
-; CHECK-NEXT:    [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer)
-; CHECK-NEXT:    [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
-; CHECK-NEXT:    [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer
-; CHECK-NEXT:    [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00
-; CHECK-NEXT:    store <2 x float> [[TMP16]], ptr [[NOR1]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP11]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP12]], <2 x float> zeroinitializer)
+; CHECK-NEXT:    [[TMP14:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul <2 x float> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[MUL6_I_I_I:%.*]] = fmul float [[TMP14]], 0.000000e+00
+; CHECK-NEXT:    store <2 x float> [[TMP15]], ptr [[NOR1]], align 4
 ; CHECK-NEXT:    store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4
 ; CHECK-NEXT:    ret i32 0
 ;
 
@@ -68,10 +68,10 @@ define void @pr35497() local_unnamed_addr #0 {
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
 ; SSE-NEXT:    [[TMP5:%.*]] = add nuw nsw <2 x i64> [[TMP4]], zeroinitializer
 ; SSE-NEXT:    store <2 x i64> [[TMP5]], ptr undef, align 1
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 0
-; SSE-NEXT:    [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], <i64 2, i64 2>
-; SSE-NEXT:    [[TMP8:%.*]] = and <2 x i64> [[TMP7]], <i64 20, i64 20>
-; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <2 x i32> <i32 1, i32 poison>
+; SSE-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
+; SSE-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
+; SSE-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
 ; SSE-NEXT:    [[TMP10:%.*]] = lshr <2 x i64> [[TMP5]], <i64 6, i64 6>
 ; SSE-NEXT:    [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]]
 ; SSE-NEXT:    store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1
@@ -88,10 +88,10 @@ define void @pr35497() local_unnamed_addr #0 {
 ; AVX-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
 ; AVX-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
 ; AVX-NEXT:    store <2 x i64> [[TMP4]], ptr undef, align 1
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[ADD]], i32 0
-; AVX-NEXT:    [[TMP6:%.*]] = shl <2 x i64> [[TMP5]], <i64 2, i64 2>
-; AVX-NEXT:    [[TMP7:%.*]] = and <2 x i64> [[TMP6]], <i64 20, i64 20>
-; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> <i32 1, i32 poison>
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 1
+; AVX-NEXT:    [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], <i64 2, i64 2>
+; AVX-NEXT:    [[TMP8:%.*]] = and <2 x i64> [[TMP7]], <i64 20, i64 20>
 ; AVX-NEXT:    [[TMP9:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
 ; AVX-NEXT:    [[TMP10:%.*]] = add nuw nsw <2 x i64> [[TMP8]], [[TMP9]]
 ; AVX-NEXT:    store <2 x i64> [[TMP10]], ptr [[ARRAYIDX2_2]], align 1