[X86] getFauxShuffleMask - generalise logical shifts to work with non-uniform shift amounts #137349
Conversation
…-uniform shift amounts

Still doesn't help pre-AVX2 targets which might have lowered SHL to a MUL by that point.
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Still doesn't help pre-AVX2 targets which might have lowered SHL to a MUL by that point.

Full diff: https://github.com/llvm/llvm-project/pull/137349.diff

2 Files Affected:
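For readers skimming the diff, here is a minimal standalone sketch of the idea the patch implements: a per-lane logical shift by a whole number of bytes is just a byte shuffle with zero fill, so each lane's shift amount can be translated independently into byte indices. The function name, the -1 "zero" sentinel and the plain std::vector types below are illustrative only; they are not the LLVM helpers the actual change uses.

// Minimal sketch, assuming little-endian byte order within each lane (as on
// x86): decode per-lane SHL/SRL by whole-byte amounts into a byte shuffle
// mask, where -1 means "this byte becomes zero".
#include <cstdint>
#include <numeric>
#include <vector>

std::vector<int> decodeShiftAsByteShuffle(const std::vector<uint64_t> &BitAmts,
                                          unsigned BytesPerLane, bool IsShl) {
  std::vector<int> Mask(BitAmts.size() * BytesPerLane, -1);
  for (unsigned I = 0; I != BitAmts.size(); ++I) {
    // Only whole-byte, in-range shift amounts can be expressed as a shuffle.
    if (BitAmts[I] % 8 != 0 || BitAmts[I] >= 8 * BytesPerLane)
      return {};
    unsigned ByteShift = BitAmts[I] / 8;
    unsigned Lo = I * BytesPerLane, Hi = Lo + BytesPerLane;
    if (IsShl) // SHL: low source bytes move to higher byte positions.
      std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
    else // SRL: high source bytes move to lower byte positions.
      std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift, Lo + ByteShift);
  }
  return Mask;
}

// E.g. the lshr <4 x i32> by <24, 0, 8, 16> test in this PR decodes to
// {3,-1,-1,-1, 4,5,6,7, 9,10,11,-1, 14,15,-1,-1}.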
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0fc50dc1a87b6..b07843523a15b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6443,25 +6443,36 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
}
case ISD::SHL:
case ISD::SRL: {
- // We can only decode 'whole byte' bit shifts as shuffles.
- std::optional<uint64_t> Amt = DAG.getValidShiftAmount(N, DemandedElts);
- if (!Amt || (*Amt % 8) != 0)
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
+ UndefElts, EltBits,
+ /*AllowWholeUndefs*/ true,
+ /*AllowPartialUndefs*/ false))
return false;
- uint64_t ByteShift = *Amt / 8;
- Ops.push_back(N.getOperand(0));
+ // We can only decode 'whole byte' bit shifts as shuffles.
+ for (unsigned I = 0; I != NumElts; ++I)
+ if (DemandedElts[I] && !UndefElts[I] &&
+ (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
+ return false;
- // Clear mask to all zeros and insert the shifted byte indices.
- Mask.append(NumSizeInBytes, SM_SentinelZero);
+ Mask.append(NumSizeInBytes, SM_SentinelUndef);
+ Ops.push_back(N.getOperand(0));
- if (ISD::SHL == Opcode) {
- for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
- for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
- Mask[i + j] = i + j - ByteShift;
- } else {
- for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
- for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
- Mask[i + j - ByteShift] = i + j;
+ for (unsigned I = 0; I != NumElts; ++I) {
+ if (!DemandedElts[I] || UndefElts[I])
+ continue;
+ unsigned ByteShift = EltBits[I].getZExtValue() / 8;
+ unsigned Lo = I * NumBytesPerElt;
+ unsigned Hi = Lo + NumBytesPerElt;
+ // Clear mask to all zeros and insert the shifted byte indices.
+ std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
+ if (ISD::SHL == Opcode)
+ std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
+ else
+ std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
+ Lo + ByteShift);
}
return true;
}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 469ef262a05f6..12d494c32b656 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -769,22 +769,10 @@ define <16 x i8> @combine_lshr_pshufb(<4 x i32> %a0) {
; SSE-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,5,6,7,4,10,11],zero,xmm0[9,14,15],zero,zero
; SSE-NEXT: retq
;
-; AVX1-LABEL: combine_lshr_pshufb:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,5,6,7,4,10,11],zero,xmm0[9,14,15],zero,zero
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_lshr_pshufb:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: combine_lshr_pshufb:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
-; AVX512F-NEXT: retq
+; AVX-LABEL: combine_lshr_pshufb:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,5,6,7,4,10,11],zero,xmm0[9,14,15],zero,zero
+; AVX-NEXT: retq
%shr = lshr <4 x i32> %a0, <i32 24, i32 0, i32 8, i32 16>
%bc = bitcast <4 x i32> %shr to <16 x i8>
%shuffle = shufflevector <16 x i8> %bc, <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 0, i32 5, i32 6, i32 7, i32 4, i32 9, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
@@ -817,14 +805,12 @@ define <16 x i8> @combine_shl_pshufb(<4 x i32> %a0) {
;
; AVX2-LABEL: combine_shl_pshufb:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6],zero,zero,xmm0[8,9],zero,zero,zero,xmm0[12,13]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_shl_pshufb:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,5,6,7,4,9,10,11,8,12,13,14,15]
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6],zero,zero,xmm0[8,9],zero,zero,zero,xmm0[12,13]
; AVX512F-NEXT: retq
%shr = shl <4 x i32> %a0, <i32 0, i32 8, i32 16, i32 16>
%bc = bitcast <4 x i32> %shr to <16 x i8>
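As a quick sanity check (an illustrative standalone snippet, not part of the patch or the test suite), composing the byte shuffle decoded from the lshr amounts <24, 0, 8, 16> above with the test's shufflevector mask reproduces exactly the single folded pshufb pattern in the updated CHECK lines:

// Illustrative check only: compose the byte shuffle decoded from
// lshr <4 x i32> %a0, <i32 24, i32 0, i32 8, i32 16> with the
// shufflevector mask from combine_lshr_pshufb and print the result.
#include <array>
#include <cstdio>

int main() {
  // Decoded shift-as-shuffle (-1 == byte forced to zero).
  std::array<int, 16> ShiftMask = {3, -1, -1, -1, 4, 5, 6, 7,
                                   9, 10, 11, -1, 14, 15, -1, -1};
  // shufflevector mask from the test.
  std::array<int, 16> ShufMask = {1, 2, 3, 0, 5, 6, 7, 4,
                                  9, 10, 11, 8, 12, 13, 14, 15};
  for (int Idx : ShufMask) {
    int M = ShiftMask[Idx];
    if (M < 0)
      std::printf("zero,");
    else
      std::printf("%d,", M);
  }
  // Prints: zero,zero,zero,3,5,6,7,4,10,11,zero,9,14,15,zero,zero
  // i.e. the pshufb in the SSE/AVX CHECK lines above.
  std::printf("\n");
  return 0;
}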
// We can only decode 'whole byte' bit shifts as shuffles.
for (unsigned I = 0; I != NumElts; ++I)
  if (DemandedElts[I] && !UndefElts[I] &&
      (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
In which case can EltBits[I].uge(NumBitsPerElt) be true?
It can happen, and should generate poison, but tbh I just wanted to avoid the issue - I also wondered if I'll need to support the AVX2 X86ISD logical shift variants at some point which do handle out of bounds shift amounts.
LGTM with a question.