Skip to content

Commit 5d91d12

Browse files
authored
[X86] getFauxShuffleMask - generalise logical shifts to work with non-uniform shift amounts (#137349)
Still doesn't help pre-AVX2 targets which might have lowered SHL to a MUL by that point.
1 parent b9e3274 commit 5d91d12

File tree

2 files changed

+32
-35
lines changed

2 files changed

+32
-35
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 26 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -6443,25 +6443,36 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
64436443
}
64446444
case ISD::SHL:
64456445
case ISD::SRL: {
6446-
// We can only decode 'whole byte' bit shifts as shuffles.
6447-
std::optional<uint64_t> Amt = DAG.getValidShiftAmount(N, DemandedElts);
6448-
if (!Amt || (*Amt % 8) != 0)
6446+
APInt UndefElts;
6447+
SmallVector<APInt, 32> EltBits;
6448+
if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6449+
UndefElts, EltBits,
6450+
/*AllowWholeUndefs*/ true,
6451+
/*AllowPartialUndefs*/ false))
64496452
return false;
64506453

6451-
uint64_t ByteShift = *Amt / 8;
6452-
Ops.push_back(N.getOperand(0));
6454+
// We can only decode 'whole byte' bit shifts as shuffles.
6455+
for (unsigned I = 0; I != NumElts; ++I)
6456+
if (DemandedElts[I] && !UndefElts[I] &&
6457+
(EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6458+
return false;
64536459

6454-
// Clear mask to all zeros and insert the shifted byte indices.
6455-
Mask.append(NumSizeInBytes, SM_SentinelZero);
6460+
Mask.append(NumSizeInBytes, SM_SentinelUndef);
6461+
Ops.push_back(N.getOperand(0));
64566462

6457-
if (ISD::SHL == Opcode) {
6458-
for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6459-
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6460-
Mask[i + j] = i + j - ByteShift;
6461-
} else {
6462-
for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6463-
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6464-
Mask[i + j - ByteShift] = i + j;
6463+
for (unsigned I = 0; I != NumElts; ++I) {
6464+
if (!DemandedElts[I] || UndefElts[I])
6465+
continue;
6466+
unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6467+
unsigned Lo = I * NumBytesPerElt;
6468+
unsigned Hi = Lo + NumBytesPerElt;
6469+
// Clear mask to all zeros and insert the shifted byte indices.
6470+
std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6471+
if (ISD::SHL == Opcode)
6472+
std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6473+
else
6474+
std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6475+
Lo + ByteShift);
64656476
}
64666477
return true;
64676478
}

llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll

Lines changed: 6 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -769,22 +769,10 @@ define <16 x i8> @combine_lshr_pshufb(<4 x i32> %a0) {
769769
; SSE-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,5,6,7,4,10,11],zero,xmm0[9,14,15],zero,zero
770770
; SSE-NEXT: retq
771771
;
772-
; AVX1-LABEL: combine_lshr_pshufb:
773-
; AVX1: # %bb.0:
774-
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,5,6,7,4,10,11],zero,xmm0[9,14,15],zero,zero
775-
; AVX1-NEXT: retq
776-
;
777-
; AVX2-LABEL: combine_lshr_pshufb:
778-
; AVX2: # %bb.0:
779-
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
780-
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
781-
; AVX2-NEXT: retq
782-
;
783-
; AVX512F-LABEL: combine_lshr_pshufb:
784-
; AVX512F: # %bb.0:
785-
; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
786-
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
787-
; AVX512F-NEXT: retq
772+
; AVX-LABEL: combine_lshr_pshufb:
773+
; AVX: # %bb.0:
774+
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,5,6,7,4,10,11],zero,xmm0[9,14,15],zero,zero
775+
; AVX-NEXT: retq
788776
%shr = lshr <4 x i32> %a0, <i32 24, i32 0, i32 8, i32 16>
789777
%bc = bitcast <4 x i32> %shr to <16 x i8>
790778
%shuffle = shufflevector <16 x i8> %bc, <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 0, i32 5, i32 6, i32 7, i32 4, i32 9, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
@@ -817,14 +805,12 @@ define <16 x i8> @combine_shl_pshufb(<4 x i32> %a0) {
817805
;
818806
; AVX2-LABEL: combine_shl_pshufb:
819807
; AVX2: # %bb.0:
820-
; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
821-
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
808+
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6],zero,zero,xmm0[8,9],zero,zero,zero,xmm0[12,13]
822809
; AVX2-NEXT: retq
823810
;
824811
; AVX512F-LABEL: combine_shl_pshufb:
825812
; AVX512F: # %bb.0:
826-
; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
827-
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
813+
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6],zero,zero,xmm0[8,9],zero,zero,zero,xmm0[12,13]
828814
; AVX512F-NEXT: retq
829815
%shr = shl <4 x i32> %a0, <i32 0, i32 8, i32 16, i32 16>
830816
%bc = bitcast <4 x i32> %shr to <16 x i8>

0 commit comments

Comments (0)