Commit e29d092

[X86] getFauxShuffleMask - add ISD::SHL/SRL handling
This is currently mostly the same as the VSHLI/VSRLI handling below, although I've kept them separate as I'm investigating adding non-uniform shift amount handling as a follow-up.
1 parent c6f3b7b commit e29d092

2 files changed: 57 additions, 33 deletions
llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 24 additions & 0 deletions
@@ -6270,6 +6270,30 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
     Ops.push_back(Src);
     return true;
   }
+  case ISD::SHL:
+  case ISD::SRL: {
+    // We can only decode 'whole byte' bit shifts as shuffles.
+    std::optional<uint64_t> Amt = DAG.getValidShiftAmount(N, DemandedElts);
+    if (!Amt || (*Amt % 8) != 0)
+      return false;
+
+    uint64_t ByteShift = *Amt / 8;
+    Ops.push_back(N.getOperand(0));
+
+    // Clear mask to all zeros and insert the shifted byte indices.
+    Mask.append(NumSizeInBytes, SM_SentinelZero);
+
+    if (ISD::SHL == Opcode) {
+      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
+        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
+          Mask[i + j] = i + j - ByteShift;
+    } else {
+      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
+        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
+          Mask[i + j - ByteShift] = i + j;
+    }
+    return true;
+  }
   case X86ISD::VSHLI:
   case X86ISD::VSRLI: {
     uint64_t ShiftVal = N.getConstantOperandVal(1);
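For reference, the mask construction added above can be illustrated with a small standalone sketch. This is not LLVM code: buildByteShiftMask is a hypothetical helper, and SM_SentinelZero is a local stand-in for the "known zero" sentinel used by the X86 shuffle combiner; the loops follow the same logic as the patch and print the resulting byte-shuffle mask for one concrete case.

// Standalone sketch (assumptions noted above), mirroring the patch's loops.
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int SM_SentinelZero = -2; // stand-in: byte is known to be zero

std::vector<int> buildByteShiftMask(bool IsSHL, unsigned NumSizeInBytes,
                                    unsigned NumBytesPerElt,
                                    uint64_t ByteShift) {
  // Start from an all-zero mask, then insert the surviving byte indices.
  std::vector<int> Mask(NumSizeInBytes, SM_SentinelZero);
  for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
    for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) {
      if (IsSHL)
        Mask[i + j] = i + j - ByteShift;  // SHL moves bytes towards the top
      else
        Mask[i + j - ByteShift] = i + j;  // SRL moves bytes towards the bottom
    }
  return Mask;
}

int main() {
  // v4i32 (16 bytes, 4 bytes per element) shifted left by 16 bits, i.e.
  // ByteShift == 2: each element keeps its low two bytes, moved up by two.
  for (int M : buildByteShiftMask(/*IsSHL=*/true, 16, 4, 2))
    std::printf("%d ", M);
  std::printf("\n"); // -2 -2 0 1 -2 -2 4 5 -2 -2 8 9 -2 -2 12 13
}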

llvm/test/CodeGen/X86/combine-pmuldq.ll

Lines changed: 33 additions & 33 deletions
@@ -246,19 +246,19 @@ define i32 @PR43159(ptr %a0) {
 ; AVX2-LABEL: PR43159:
 ; AVX2: # %bb.0: # %entry
 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT: vmovd %xmm0, %edi
 ; AVX2-NEXT: vpextrd $1, %xmm0, %esi
@@ -269,19 +269,19 @@ define i32 @PR43159(ptr %a0) {
 ; AVX512VL-LABEL: PR43159:
 ; AVX512VL: # %bb.0: # %entry
 ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
-; AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX512VL-NEXT: vpaddd %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX512VL-NEXT: vmovd %xmm0, %edi
 ; AVX512VL-NEXT: vpextrd $1, %xmm0, %esi
@@ -292,19 +292,19 @@ define i32 @PR43159(ptr %a0) {
 ; AVX512DQVL-LABEL: PR43159:
 ; AVX512DQVL: # %bb.0: # %entry
 ; AVX512DQVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512DQVL-NEXT: vpsubd %xmm2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; AVX512DQVL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512DQVL-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
-; AVX512DQVL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512DQVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX512DQVL-NEXT: vpaddd %xmm2, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vmovd %xmm0, %edi
 ; AVX512DQVL-NEXT: vpextrd $1, %xmm0, %esi
