; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,-avx,-avx2 | FileCheck %s --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+sse4a | FileCheck %s --check-prefix=ZNVER1
;
; Check that a variable shift of an i8 vector is permuted into a widened shift.
;
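; Roughly, when the shift amounts can be paired, the expected lowering follows
; the sketch below (an illustrative outline only; PERM, POW2 and MASK are
; placeholders, not constants taken from the checks):
;
;   pshufb PERM, %xmm0   ; group bytes that share a shift amount into i16 words
;   pmullw POW2, %xmm0   ; one multiply per word shifts both of its bytes left
;   pshufb PERM, %xmm0   ; restore the original byte order
;   pand   MASK, %xmm0   ; clear bits that crossed a byte boundary
;
; Logical right shifts use pmulhuw with 2^(16-amount) multipliers instead, and
; arithmetic right shifts add a pxor/psubb sign fixup, as the checks below show.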

; The transform only occurs on SSSE3 because the operand is not a shuffle and
; the shift amounts cannot be rearranged into quads. The correctness of the
; untransformed variants is not checked here, as they are covered by other
; vector shift tests.
define <16 x i8> @shl_v16i8(<16 x i8> %a) {
; SSSE3-LABEL: shl_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [8,1,2,12,4,5,6,7,0,9,10,11,3,13,14,15]
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,4,1,1,8,1,16,32]
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: shl_v16i8:
; AVX: # %bb.0:
; AVX-NOT: pshufb
; AVX-NOT: vpshufb
; AVX: retq
;
; AVX2-LABEL: shl_v16i8:
; AVX2: # %bb.0:
; AVX2-NOT: pshufb
; AVX2-NOT: vpshufb
; AVX2: retq
  %shift = shl <16 x i8> %a, <i8 3, i8 0, i8 2, i8 4, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 3, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 5>
  ret <16 x i8> %shift
}

define <16 x i8> @lshr_v16i8(<16 x i8> %a) {
; SSSE3-LABEL: lshr_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[2,1,4,3,6,5,8,7,10,9,12,11,14,13,0,15]
; SSSE3-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,2048,8192,16384,32768,8192,2048,4096]
; SSSE3-NEXT: pshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15]
; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: lshr_v16i8:
; AVX: # %bb.0:
; AVX-NOT: pshufb
; AVX-NOT: vpshufb
; AVX: retq
;
; AVX2-LABEL: lshr_v16i8:
; AVX2: # %bb.0:
; AVX2-NOT: pshufb
; AVX2-NOT: vpshufb
; AVX2: retq
  %shift = lshr <16 x i8> %a, <i8 4, i8 2, i8 2, i8 5, i8 5, i8 3, i8 3, i8 2, i8 2, i8 1, i8 1, i8 3, i8 3, i8 5, i8 5, i8 4>
  ret <16 x i8> %shift
}

define <16 x i8> @ashr_v16i8(<16 x i8> %a) {
; SSSE3-LABEL: ashr_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,12,2,3,4,9,11,7,8,13,10,6,1,14,5,15]
; SSSE3-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,8192,512,8192,4096,1024,32768,2048]
; SSSE3-NEXT: pshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,12,2,3,4,14,11,7,8,5,10,6,1,9,13,15]
; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT: movdqa {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [32,64,16,16,1,4,2,16,8,1,u,16,32,8,64,4]
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: psubb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: ashr_v16i8:
; AVX: # %bb.0:
; AVX-NOT: pshufb
; AVX-NOT: vpshufb
; AVX: retq
;
; AVX2-LABEL: ashr_v16i8:
; AVX2: # %bb.0:
; AVX2-NOT: pshufb
; AVX2-NOT: vpshufb
; AVX2: retq
  %shift = ashr <16 x i8> %a, <i8 2, i8 1, i8 3, i8 3, i8 7, i8 5, i8 6, i8 3, i8 4, i8 7, i8 undef, i8 3, i8 2, i8 4, i8 1, i8 5>
  ret <16 x i8> %shift
}

; Shift amounts cannot be paired.
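; (Illustrative note, not part of the checks: the amount multiset below is
; {0 x2, 1 x2, 2 x2, 3 x2, 4 x1, 5 x3, 6 x2, 7 x2}; 4 and 5 occur an odd number
; of times and there are no undef amounts to absorb them, so no byte permutation
; can give both halves of every i16 word the same shift amount.)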
define <16 x i8> @not_shl_v16i8(<16 x i8> %a) {
; SSSE3-LABEL: not_shl_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NOT: pshufb
; SSSE3-NOT: vpshufb
; SSSE3: retq
;
; AVX-LABEL: not_shl_v16i8:
; AVX: # %bb.0:
; AVX-NOT: pshufb
; AVX-NOT: vpshufb
; AVX: retq
;
; AVX2-LABEL: not_shl_v16i8:
; AVX2: # %bb.0:
; AVX2-NOT: pshufb
; AVX2-NOT: vpshufb
; AVX2: retq
  %shift = shl <16 x i8> %a, <i8 2, i8 1, i8 3, i8 0, i8 7, i8 5, i8 6, i8 4, i8 2, i8 1, i8 3, i8 0, i8 7, i8 5, i8 6, i8 5>
  ret <16 x i8> %shift
}

; Right shift amounts contain a zero and cannot form quads.
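; (Illustrative note: every amount below occurs an even number of times, but a
; logical right shift by 0 presumably cannot be expressed with a pmulhuw
; multiplier (it would need 2^16, which does not fit in 16 bits), and 0, 1, 3
; and 4 each occur only twice, so dword quads cannot be formed either.)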
define <16 x i8> @not_lshr_v16i8(<16 x i8> %a) {
; SSSE3-LABEL: not_lshr_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NOT: pshufb
; SSSE3-NOT: vpshufb
; SSSE3: retq
;
; AVX-LABEL: not_lshr_v16i8:
; AVX: # %bb.0:
; AVX-NOT: pshufb
; AVX-NOT: vpshufb
; AVX: retq
;
; AVX2-LABEL: not_lshr_v16i8:
; AVX2: # %bb.0:
; AVX2-NOT: pshufb
; AVX2-NOT: vpshufb
; AVX2: retq
  %shift = lshr <16 x i8> %a, <i8 4, i8 2, i8 2, i8 5, i8 5, i8 3, i8 3, i8 2, i8 2, i8 1, i8 1, i8 0, i8 0, i8 5, i8 5, i8 4>
  ret <16 x i8> %shift
}

; Shift amounts cannot form quads and the operand is not a shuffle; only transform on SSSE3.
define <32 x i8> @shl_v32i8(<32 x i8> %a) {
; SSSE3-LABEL: shl_v32i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # xmm2 = [0,2,1,3,6,5,4,7,8,9,12,11,10,13,14,15]
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: movdqa {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # xmm3 = [1,4,8,2,16,32,64,16]
; SSSE3-NEXT: pmullw %xmm3, %xmm0
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: movdqa {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # xmm4 = [255,252,255,252,254,248,248,254,240,240,192,224,224,192,240,240]
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pmullw %xmm3, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: retq
;
; AVX-LABEL: shl_v32i8:
; AVX: # %bb.0:
; AVX-NOT: pshufb
; AVX-NOT: vpshufb
; AVX: retq
;
; AVX2-LABEL: shl_v32i8:
; AVX2: # %bb.0:
; AVX2-NOT: pshufb
; AVX2-NOT: vpshufb
; AVX2: retq
  %shift = shl <32 x i8> %a, <i8 0, i8 2, i8 0, i8 2, i8 1, i8 3, i8 3, i8 1, i8 4, i8 4, i8 6, i8 5, i8 5, i8 6, i8 4, i8 4,
                              i8 0, i8 2, i8 0, i8 2, i8 1, i8 3, i8 3, i8 1, i8 4, i8 4, i8 6, i8 5, i8 5, i8 6, i8 4, i8 4>
  ret <32 x i8> %shift
}

; Quads are only tested on AVX2 targets, as they have vps**vd.
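;
; A rough sketch of the quad form, mirroring the AVX2 checks below (PERM,
; AMTS, UNPERM and MASK are placeholders, not the actual constant-pool values):
;
;   vpshufb PERM, %ymm0, %ymm0     ; group four bytes per shift amount into each dword
;   vpsllvd AMTS, %ymm0, %ymm0     ; per-dword variable shift, available from AVX2 onwards
;   vpshufb UNPERM, %ymm0, %ymm0   ; restore the original byte order
;   vpand   MASK, %ymm0, %ymm0     ; clear bits that crossed a byte boundary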
define <32 x i8> @shl_v32i8_quad(<32 x i8> %a) {
; AVX2-LABEL: shl_v32i8_quad:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31]
; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; ZNVER1-LABEL: shl_v32i8_quad:
; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31]
; ZNVER1-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ZNVER1-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
; ZNVER1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ZNVER1-NEXT: retq
  %shift = shl <32 x i8> %a, <i8 0, i8 2, i8 4, i8 6, i8 4, i8 0, i8 6, i8 2, i8 2, i8 0, i8 4, i8 6, i8 6, i8 0, i8 4, i8 2,
                              i8 1, i8 3, i8 5, i8 7, i8 1, i8 3, i8 5, i8 7, i8 7, i8 5, i8 3, i8 1, i8 7, i8 5, i8 3, i8 1>
  ret <32 x i8> %shift
}

define <32 x i8> @lshr_v32i8_quad(<32 x i8> %a) {
; AVX2-LABEL: lshr_v32i8_quad:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31]
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; ZNVER1-LABEL: lshr_v32i8_quad:
; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31]
; ZNVER1-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ZNVER1-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
; ZNVER1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ZNVER1-NEXT: retq
  %shift = lshr <32 x i8> %a, <i8 0, i8 2, i8 4, i8 6, i8 4, i8 0, i8 6, i8 2, i8 2, i8 0, i8 4, i8 6, i8 6, i8 0, i8 4, i8 2,
                               i8 1, i8 3, i8 5, i8 7, i8 1, i8 3, i8 5, i8 7, i8 7, i8 5, i8 3, i8 1, i8 7, i8 5, i8 3, i8 1>
  ret <32 x i8> %shift
}

; Disable the transform for AMD Zen because it can schedule two vpmullw
; instructions 2 cycles faster than Intel.
define <32 x i8> @ashr_v32i8_quad(<32 x i8> %a) {
; AVX2-LABEL: ashr_v32i8_quad:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31]
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 # ymm1 = [128,32,8,2,8,128,2,32,32,128,8,2,2,128,8,32,64,16,4,1,64,16,4,1,1,4,16,64,1,4,16,64]
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; ZNVER1-LABEL: ashr_v32i8_quad:
; ZNVER1: # %bb.0:
; ZNVER1-NOT: pshufb
; ZNVER1-NOT: vpshufb
; ZNVER1: retq
  %shift = ashr <32 x i8> %a, <i8 0, i8 2, i8 4, i8 6, i8 4, i8 0, i8 6, i8 2, i8 2, i8 0, i8 4, i8 6, i8 6, i8 0, i8 4, i8 2,
                               i8 1, i8 3, i8 5, i8 7, i8 1, i8 3, i8 5, i8 7, i8 7, i8 5, i8 3, i8 1, i8 7, i8 5, i8 3, i8 1>
  ret <32 x i8> %shift
}

; Shift amounts cannot be paired within a 128-bit lane.
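; (Illustrative note: vpshufb only shuffles within 128-bit lanes; below, the low
; lane has fifteen 2s and one 3 and the high lane has thirteen 2s and three 3s,
; so neither lane can pair equal amounts on its own.)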
define <32 x i8> @not_shl_v32i8(<32 x i8> %a) {
; SSSE3-LABEL: not_shl_v32i8:
; SSSE3: # %bb.0:
; SSSE3-NOT: pshufb
; SSSE3-NOT: vpshufb
; SSSE3: retq
;
; AVX-LABEL: not_shl_v32i8:
; AVX: # %bb.0:
; AVX-NOT: pshufb
; AVX-NOT: vpshufb
; AVX: retq
;
; AVX2-LABEL: not_shl_v32i8:
; AVX2: # %bb.0:
; AVX2-NOT: pshufb
; AVX2-NOT: vpshufb
; AVX2: retq
  %shift = shl <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 3,
                              i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 3, i8 3, i8 3>
  ret <32 x i8> %shift
}

; Always transform when the operand is a shuffle and the shift amounts can be paired.
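; (Illustrative note: the incoming shufflevector presumably folds into the
; transform's pairing permutation, which is why the checks below still start
; with a single pshufb before the multiply, even on targets where the transform
; is otherwise skipped.)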
define <16 x i8> @ashr_shuffle_v16i8(<16 x i8> %a) {
; SSSE3-LABEL: ashr_shuffle_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,8,4,12,1,9,5,13,2,10,6,14,3,11,7,15]
; SSSE3-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,16384,16384,8192,8192,4096,4096,2048]
; SSSE3-NEXT: pshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]
; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT: movdqa {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [64,32,64,32,32,16,32,16,16,8,16,8,8,4,8,4]
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: psubb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: ashr_shuffle_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,8,4,12,1,9,5,13,2,10,6,14,3,11,7,15]
; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,16384,8192,8192,4096,4096,2048]
; AVX-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [64,32,64,32,32,16,32,16,16,8,16,8,8,4,8,4]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX2-LABEL: ashr_shuffle_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,8,4,12,1,9,5,13,2,10,6,14,3,11,7,15]
; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,16384,8192,8192,4096,4096,2048]
; AVX2-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [64,32,64,32,32,16,32,16,16,8,16,8,8,4,8,4]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; ZNVER1-LABEL: ashr_shuffle_v16i8:
; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,8,4,12,1,9,5,13,2,10,6,14,3,11,7,15]
; ZNVER1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,16384,8192,8192,4096,4096,2048]
; ZNVER1-NEXT: vpshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]
; ZNVER1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; ZNVER1-NEXT: vmovdqa {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [64,32,64,32,32,16,32,16,16,8,16,8,8,4,8,4]
; ZNVER1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; ZNVER1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; ZNVER1-NEXT: retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  %shift = ashr <16 x i8> %shuffle, <i8 1, i8 2, i8 1, i8 2, i8 2, i8 3, i8 2, i8 3, i8 3, i8 4, i8 3, i8 4, i8 4, i8 5, i8 4, i8 5>
  ret <16 x i8> %shift
}