|
| 1 | +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py |
| 2 | +; RUN: opt < %s -mtriple=x86_64 -slp-vectorizer -S -mcpu=skylake-avx512 | FileCheck %s |
| 3 | + |
| 4 | +; This test covers a case with multiple vectorization possibilities, where |
| 5 | +; the most effective way to vectorize is to match both 8-way reductions |
| 6 | +; that feed the insertelement build-vector sequence. |
| 7 | + |
| | +; Masked scatter of a <2 x double> through a vector of pointers; called at the end of @test. |
| 8 | +declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32 immarg, <2 x i1>) |
| 9 | + |
| 10 | +define void @test(double* nocapture readonly %arg, double* nocapture readonly %arg1, double* nocapture %arg2) { |
| 11 | +; CHECK-LABEL: @test( |
| 12 | +; CHECK-NEXT: entry: |
| 13 | +; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, double* [[ARG:%.*]], i64 1 |
| 14 | +; CHECK-NEXT: [[LD1_0:%.*]] = load double, double* [[GEP1_0]], align 8 |
| 15 | +; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, double* [[ARG1:%.*]], i64 16 |
| 16 | +; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 3 |
| 17 | +; CHECK-NEXT: [[LD1_1:%.*]] = load double, double* [[GEP1_1]], align 8 |
| 18 | +; CHECK-NEXT: [[GEP0_1:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 1 |
| 19 | +; CHECK-NEXT: [[GEP2_1:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 17 |
| 20 | +; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 5 |
| 21 | +; CHECK-NEXT: [[LD1_2:%.*]] = load double, double* [[GEP1_2]], align 8 |
| 22 | +; CHECK-NEXT: [[GEP0_2:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 2 |
| 23 | +; CHECK-NEXT: [[GEP2_2:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 18 |
| 24 | +; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 7 |
| 25 | +; CHECK-NEXT: [[LD1_3:%.*]] = load double, double* [[GEP1_3]], align 8 |
| 26 | +; CHECK-NEXT: [[GEP0_3:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 3 |
| 27 | +; CHECK-NEXT: [[GEP2_3:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 19 |
| 28 | +; CHECK-NEXT: [[GEP1_4:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 9 |
| 29 | +; CHECK-NEXT: [[LD1_4:%.*]] = load double, double* [[GEP1_4]], align 8 |
| 30 | +; CHECK-NEXT: [[GEP0_4:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 4 |
| 31 | +; CHECK-NEXT: [[GEP2_4:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 20 |
| 32 | +; CHECK-NEXT: [[GEP1_5:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 11 |
| 33 | +; CHECK-NEXT: [[LD1_5:%.*]] = load double, double* [[GEP1_5]], align 8 |
| 34 | +; CHECK-NEXT: [[GEP0_5:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 5 |
| 35 | +; CHECK-NEXT: [[GEP2_5:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 21 |
| 36 | +; CHECK-NEXT: [[GEP1_6:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 13 |
| 37 | +; CHECK-NEXT: [[LD1_6:%.*]] = load double, double* [[GEP1_6]], align 8 |
| 38 | +; CHECK-NEXT: [[GEP0_6:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 6 |
| 39 | +; CHECK-NEXT: [[GEP2_6:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 22 |
| 40 | +; CHECK-NEXT: [[GEP1_7:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 15 |
| 41 | +; CHECK-NEXT: [[LD1_7:%.*]] = load double, double* [[GEP1_7]], align 8 |
| 42 | +; CHECK-NEXT: [[GEP0_7:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 7 |
| 43 | +; CHECK-NEXT: [[GEP2_7:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 23 |
| 44 | +; CHECK-NEXT: [[LD0_0:%.*]] = load double, double* [[ARG1]], align 8 |
| 45 | +; CHECK-NEXT: [[LD2_0:%.*]] = load double, double* [[GEP2_0]], align 8 |
| 46 | +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD0_0]], i32 0 |
| 47 | +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[LD2_0]], i32 1 |
| 48 | +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LD1_0]], i32 0 |
| 49 | +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[LD1_0]], i32 1 |
| 50 | +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP3]] |
| 51 | +; CHECK-NEXT: [[LD0_1:%.*]] = load double, double* [[GEP0_1]], align 8 |
| 52 | +; CHECK-NEXT: [[LD2_1:%.*]] = load double, double* [[GEP2_1]], align 8 |
| 53 | +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[LD0_1]], i32 0 |
| 54 | +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[LD2_1]], i32 1 |
| 55 | +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[LD1_1]], i32 0 |
| 56 | +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[LD1_1]], i32 1 |
| 57 | +; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> [[TMP6]], [[TMP8]] |
| 58 | +; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]] |
| 59 | +; CHECK-NEXT: [[LD0_2:%.*]] = load double, double* [[GEP0_2]], align 8 |
| 60 | +; CHECK-NEXT: [[LD2_2:%.*]] = load double, double* [[GEP2_2]], align 8 |
| 61 | +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[LD0_2]], i32 0 |
| 62 | +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[LD2_2]], i32 1 |
| 63 | +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[LD1_2]], i32 0 |
| 64 | +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[LD1_2]], i32 1 |
| 65 | +; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP12]], [[TMP14]] |
| 66 | +; CHECK-NEXT: [[TMP16:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP15]] |
| 67 | +; CHECK-NEXT: [[LD0_3:%.*]] = load double, double* [[GEP0_3]], align 8 |
| 68 | +; CHECK-NEXT: [[LD2_3:%.*]] = load double, double* [[GEP2_3]], align 8 |
| 69 | +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x double> poison, double [[LD0_3]], i32 0 |
| 70 | +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x double> [[TMP17]], double [[LD2_3]], i32 1 |
| 71 | +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[LD1_3]], i32 0 |
| 72 | +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[LD1_3]], i32 1 |
| 73 | +; CHECK-NEXT: [[TMP21:%.*]] = fmul fast <2 x double> [[TMP18]], [[TMP20]] |
| 74 | +; CHECK-NEXT: [[TMP22:%.*]] = fadd fast <2 x double> [[TMP16]], [[TMP21]] |
| 75 | +; CHECK-NEXT: [[LD0_4:%.*]] = load double, double* [[GEP0_4]], align 8 |
| 76 | +; CHECK-NEXT: [[LD2_4:%.*]] = load double, double* [[GEP2_4]], align 8 |
| 77 | +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x double> poison, double [[LD0_4]], i32 0 |
| 78 | +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x double> [[TMP23]], double [[LD2_4]], i32 1 |
| 79 | +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <2 x double> poison, double [[LD1_4]], i32 0 |
| 80 | +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[TMP25]], double [[LD1_4]], i32 1 |
| 81 | +; CHECK-NEXT: [[TMP27:%.*]] = fmul fast <2 x double> [[TMP24]], [[TMP26]] |
| 82 | +; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <2 x double> [[TMP22]], [[TMP27]] |
| 83 | +; CHECK-NEXT: [[LD0_5:%.*]] = load double, double* [[GEP0_5]], align 8 |
| 84 | +; CHECK-NEXT: [[LD2_5:%.*]] = load double, double* [[GEP2_5]], align 8 |
| 85 | +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x double> poison, double [[LD0_5]], i32 0 |
| 86 | +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x double> [[TMP29]], double [[LD2_5]], i32 1 |
| 87 | +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[LD1_5]], i32 0 |
| 88 | +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[TMP31]], double [[LD1_5]], i32 1 |
| 89 | +; CHECK-NEXT: [[TMP33:%.*]] = fmul fast <2 x double> [[TMP30]], [[TMP32]] |
| 90 | +; CHECK-NEXT: [[TMP34:%.*]] = fadd fast <2 x double> [[TMP28]], [[TMP33]] |
| 91 | +; CHECK-NEXT: [[LD0_6:%.*]] = load double, double* [[GEP0_6]], align 8 |
| 92 | +; CHECK-NEXT: [[LD2_6:%.*]] = load double, double* [[GEP2_6]], align 8 |
| 93 | +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <2 x double> poison, double [[LD0_6]], i32 0 |
| 94 | +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <2 x double> [[TMP35]], double [[LD2_6]], i32 1 |
| 95 | +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <2 x double> poison, double [[LD1_6]], i32 0 |
| 96 | +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x double> [[TMP37]], double [[LD1_6]], i32 1 |
| 97 | +; CHECK-NEXT: [[TMP39:%.*]] = fmul fast <2 x double> [[TMP36]], [[TMP38]] |
| 98 | +; CHECK-NEXT: [[TMP40:%.*]] = fadd fast <2 x double> [[TMP34]], [[TMP39]] |
| 99 | +; CHECK-NEXT: [[LD0_7:%.*]] = load double, double* [[GEP0_7]], align 8 |
| 100 | +; CHECK-NEXT: [[LD2_7:%.*]] = load double, double* [[GEP2_7]], align 8 |
| 101 | +; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x double> poison, double [[LD0_7]], i32 0 |
| 102 | +; CHECK-NEXT: [[TMP42:%.*]] = insertelement <2 x double> [[TMP41]], double [[LD2_7]], i32 1 |
| 103 | +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> poison, double [[LD1_7]], i32 0 |
| 104 | +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x double> [[TMP43]], double [[LD1_7]], i32 1 |
| 105 | +; CHECK-NEXT: [[TMP45:%.*]] = fmul fast <2 x double> [[TMP42]], [[TMP44]] |
| 106 | +; CHECK-NEXT: [[TMP46:%.*]] = fadd fast <2 x double> [[TMP40]], [[TMP45]] |
| 107 | +; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, double* [[ARG2:%.*]], <2 x i64> <i64 0, i64 16> |
| 108 | +; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> [[TMP46]], <2 x double*> [[P]], i32 8, <2 x i1> <i1 true, i1 true>) |
| 109 | +; CHECK-NEXT: ret void |
| 110 | +; |
| | +; Scalar input: two 8-term fast-math multiply-add chains that share one operand. |
| | +;   %rdx1 = sum over k = 0..7 of %arg1[k]      * %arg[2k+1] |
| | +;   %rdx2 = sum over k = 0..7 of %arg1[16 + k] * %arg[2k+1] |
| | +; The loads and multiplies of the two chains are interleaved term by term. |
| | +; The two sums are then packed into a <2 x double> and stored through a masked |
| | +; scatter to %arg2[0] and %arg2[16]. |
| | +; NOTE(review): the assertions above expect the two chains paired lane-wise |
| | +; into <2 x double> fmul/fadd operations, not the 8-way reductions the file |
| | +; header describes -- presumably current behaviour to be improved; confirm. |
| 111 | +entry: |
| 112 | + %gep1.0 = getelementptr inbounds double, double* %arg, i64 1 |
| 113 | + %ld1.0 = load double, double* %gep1.0, align 8 |
| 114 | + %ld0.0 = load double, double* %arg1, align 8 |
| 115 | + %mul1.0 = fmul fast double %ld0.0, %ld1.0 |
| 116 | + %gep2.0 = getelementptr inbounds double, double* %arg1, i64 16 |
| 117 | + %ld2.0 = load double, double* %gep2.0, align 8 |
| 118 | + %mul2.0 = fmul fast double %ld2.0, %ld1.0 |
| 119 | + %gep1.1 = getelementptr inbounds double, double* %arg, i64 3 |
| 120 | + %ld1.1 = load double, double* %gep1.1, align 8 |
| 121 | + %gep0.1 = getelementptr inbounds double, double* %arg1, i64 1 |
| 122 | + %ld0.1 = load double, double* %gep0.1, align 8 |
| 123 | + %mul1.1 = fmul fast double %ld0.1, %ld1.1 |
| 124 | + %rdx1.0 = fadd fast double %mul1.0, %mul1.1 |
| 125 | + %gep2.1 = getelementptr inbounds double, double* %arg1, i64 17 |
| 126 | + %ld2.1 = load double, double* %gep2.1, align 8 |
| 127 | + %mul2.1 = fmul fast double %ld2.1, %ld1.1 |
| 128 | + %rdx2.0 = fadd fast double %mul2.0, %mul2.1 |
| 129 | + %gep1.2 = getelementptr inbounds double, double* %arg, i64 5 |
| 130 | + %ld1.2 = load double, double* %gep1.2, align 8 |
| 131 | + %gep0.2 = getelementptr inbounds double, double* %arg1, i64 2 |
| 132 | + %ld0.2 = load double, double* %gep0.2, align 8 |
| 133 | + %mul1.2 = fmul fast double %ld0.2, %ld1.2 |
| 134 | + %rdx1.1 = fadd fast double %rdx1.0, %mul1.2 |
| 135 | + %gep2.2 = getelementptr inbounds double, double* %arg1, i64 18 |
| 136 | + %ld2.2 = load double, double* %gep2.2, align 8 |
| 137 | + %mul2.2 = fmul fast double %ld2.2, %ld1.2 |
| 138 | + %rdx2.1 = fadd fast double %rdx2.0, %mul2.2 |
| 139 | + %gep1.3 = getelementptr inbounds double, double* %arg, i64 7 |
| 140 | + %ld1.3 = load double, double* %gep1.3, align 8 |
| 141 | + %gep0.3 = getelementptr inbounds double, double* %arg1, i64 3 |
| 142 | + %ld0.3 = load double, double* %gep0.3, align 8 |
| 143 | + %mul1.3 = fmul fast double %ld0.3, %ld1.3 |
| 144 | + %rdx1.2 = fadd fast double %rdx1.1, %mul1.3 |
| 145 | + %gep2.3 = getelementptr inbounds double, double* %arg1, i64 19 |
| 146 | + %ld2.3 = load double, double* %gep2.3, align 8 |
| 147 | + %mul2.3 = fmul fast double %ld2.3, %ld1.3 |
| 148 | + %rdx2.2 = fadd fast double %rdx2.1, %mul2.3 |
| 149 | + %gep1.4 = getelementptr inbounds double, double* %arg, i64 9 |
| 150 | + %ld1.4 = load double, double* %gep1.4, align 8 |
| 151 | + %gep0.4 = getelementptr inbounds double, double* %arg1, i64 4 |
| 152 | + %ld0.4 = load double, double* %gep0.4, align 8 |
| 153 | + %mul1.4 = fmul fast double %ld0.4, %ld1.4 |
| 154 | + %rdx1.3 = fadd fast double %rdx1.2, %mul1.4 |
| 155 | + %gep2.4 = getelementptr inbounds double, double* %arg1, i64 20 |
| 156 | + %ld2.4 = load double, double* %gep2.4, align 8 |
| 157 | + %mul2.4 = fmul fast double %ld2.4, %ld1.4 |
| 158 | + %rdx2.3 = fadd fast double %rdx2.2, %mul2.4 |
| 159 | + %gep1.5 = getelementptr inbounds double, double* %arg, i64 11 |
| 160 | + %ld1.5 = load double, double* %gep1.5, align 8 |
| 161 | + %gep0.5 = getelementptr inbounds double, double* %arg1, i64 5 |
| 162 | + %ld0.5 = load double, double* %gep0.5, align 8 |
| 163 | + %mul1.5 = fmul fast double %ld0.5, %ld1.5 |
| 164 | + %rdx1.4 = fadd fast double %rdx1.3, %mul1.5 |
| 165 | + %gep2.5 = getelementptr inbounds double, double* %arg1, i64 21 |
| 166 | + %ld2.5 = load double, double* %gep2.5, align 8 |
| 167 | + %mul2.5 = fmul fast double %ld2.5, %ld1.5 |
| 168 | + %rdx2.4 = fadd fast double %rdx2.3, %mul2.5 |
| 169 | + %gep1.6 = getelementptr inbounds double, double* %arg, i64 13 |
| 170 | + %ld1.6 = load double, double* %gep1.6, align 8 |
| 171 | + %gep0.6 = getelementptr inbounds double, double* %arg1, i64 6 |
| 172 | + %ld0.6 = load double, double* %gep0.6, align 8 |
| 173 | + %mul1.6 = fmul fast double %ld0.6, %ld1.6 |
| 174 | + %rdx1.5 = fadd fast double %rdx1.4, %mul1.6 |
| 175 | + %gep2.6 = getelementptr inbounds double, double* %arg1, i64 22 |
| 176 | + %ld2.6 = load double, double* %gep2.6, align 8 |
| 177 | + %mul2.6 = fmul fast double %ld2.6, %ld1.6 |
| 178 | + %rdx2.5 = fadd fast double %rdx2.4, %mul2.6 |
| 179 | + %gep1.7 = getelementptr inbounds double, double* %arg, i64 15 |
| 180 | + %ld1.7 = load double, double* %gep1.7, align 8 |
| 181 | + %gep0.7 = getelementptr inbounds double, double* %arg1, i64 7 |
| 182 | + %ld0.7 = load double, double* %gep0.7, align 8 |
| 183 | + %mul1.7 = fmul fast double %ld0.7, %ld1.7 |
| 184 | + %rdx1 = fadd fast double %rdx1.5, %mul1.7 |
| 185 | + %gep2.7 = getelementptr inbounds double, double* %arg1, i64 23 |
| 186 | + %ld2.7 = load double, double* %gep2.7, align 8 |
| 187 | + %mul2.7 = fmul fast double %ld2.7, %ld1.7 |
| 188 | + %rdx2 = fadd fast double %rdx2.5, %mul2.7 |
| | +; Build-vector root: lane 0 takes %rdx1, lane 1 takes %rdx2; the scatter is called with alignment 8 and an all-true mask. |
| 189 | + %i142 = insertelement <2 x double> poison, double %rdx1, i64 0 |
| 190 | + %i143 = insertelement <2 x double> %i142, double %rdx2, i64 1 |
| 191 | + %p = getelementptr inbounds double, double* %arg2, <2 x i64> <i64 0, i64 16> |
| 192 | + call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %i143, <2 x double*> %p, i32 8, <2 x i1> <i1 true, i1 true>) |
| 193 | + ret void |
| 194 | +} |
0 commit comments