Skip to content

Commit 071671e

Browse files
committed
[X86] Allow pre-SSE41 targets to extract multiple v16i8 elements coming from the same DWORD/WORD super-element
Pre-SSE41 targets tended to have weak (serial) GPR<->VEC moves, meaning we only allowed a single v16i8 extraction before spilling the vector to the stack and loading the i8 elements instead. But this didn't make use of the fact that the DWORD/WORD extraction we had to use could extract multiple i8 elements at the same time. This patch attempts to determine if all uses of a vector are element extractions, and works out whether all the extractions share the same WORD or (lowest) DWORD, in which case we can perform a single extraction and just shift/truncate the individual elements. Differential Revision: https://reviews.llvm.org/D156350
1 parent cce3599 commit 071671e

File tree

4 files changed

+198
-195
lines changed

4 files changed

+198
-195
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 52 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17648,6 +17648,40 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
1764817648
DAG.getIntPtrConstant(0, dl));
1764917649
}
1765017650

17651+
// Helper to find all the extracted elements from a vector.
//
// Walks every user of N and returns a mask with one bit per element of N's
// first result type; a set bit means some user extracts that element.
// The result is conservative: if any user is not a recognised extraction
// (or a vector bitcast that can be looked through), or an extraction uses a
// non-constant index, every bit is set.
17652+
static APInt getExtractedDemandedElts(SDNode *N) {
17653+
MVT VT = N->getSimpleValueType(0);
17654+
unsigned NumElts = VT.getVectorNumElements();
17655+
APInt DemandedElts = APInt::getZero(NumElts);
17656+
for (SDNode *User : N->uses()) {
17657+
switch (User->getOpcode()) {
17658+
case X86ISD::PEXTRB:
17659+
case X86ISD::PEXTRW:
17660+
case ISD::EXTRACT_VECTOR_ELT:
// Operand 1 is the element index; a variable index could touch any element.
17661+
if (!isa<ConstantSDNode>(User->getOperand(1))) {
17662+
DemandedElts.setAllBits();
17663+
return DemandedElts;
17664+
}
// NOTE(review): assumes the constant index is < NumElts - confirm that
// out-of-range extracts cannot reach here.
17665+
DemandedElts.setBit(User->getConstantOperandVal(1));
17666+
break;
17667+
case ISD::BITCAST: {
// Only look through bitcasts to another simple vector type; anything else
// is treated as using the whole vector.
17668+
if (!User->getValueType(0).isSimple() ||
17669+
!User->getValueType(0).isVector()) {
17670+
DemandedElts.setAllBits();
17671+
return DemandedElts;
17672+
}
// Recurse on the bitcast's own users, then rescale that mask (expressed in
// the bitcast's element count) to this vector's NumElts before merging.
17673+
APInt DemandedSrcElts = getExtractedDemandedElts(User);
17674+
DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
17675+
break;
17676+
}
17677+
default:
// Any other kind of user is assumed to need every element.
17678+
DemandedElts.setAllBits();
17679+
return DemandedElts;
17680+
}
17681+
}
17682+
return DemandedElts;
17683+
}
17684+
1765117685
SDValue
1765217686
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
1765317687
SelectionDAG &DAG) const {
@@ -17739,13 +17773,16 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
1773917773
if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
1774017774
return Res;
1774117775

17742-
// TODO: We only extract a single element from v16i8, we can probably afford
17743-
// to be more aggressive here before using the default approach of spilling to
17744-
// stack.
17745-
if (VT == MVT::i8 && Op->isOnlyUserOf(Vec.getNode())) {
17776+
// Only extract a single element from a v16i8 source - determine the common
17777+
// DWORD/WORD that all extractions share, and extract the sub-byte.
17778+
// TODO: Add QWORD MOVQ extraction?
17779+
if (VT == MVT::i8) {
17780+
APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
17781+
assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
17782+
1774617783
// Extract either the lowest i32 or any i16, and extract the sub-byte.
1774717784
int DWordIdx = IdxVal / 4;
17748-
if (DWordIdx == 0) {
17785+
if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
1774917786
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
1775017787
DAG.getBitcast(MVT::v4i32, Vec),
1775117788
DAG.getIntPtrConstant(DWordIdx, dl));
@@ -17757,14 +17794,16 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
1775717794
}
1775817795

1775917796
int WordIdx = IdxVal / 2;
17760-
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
17761-
DAG.getBitcast(MVT::v8i16, Vec),
17762-
DAG.getIntPtrConstant(WordIdx, dl));
17763-
int ShiftVal = (IdxVal % 2) * 8;
17764-
if (ShiftVal != 0)
17765-
Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
17766-
DAG.getConstant(ShiftVal, dl, MVT::i8));
17767-
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
17797+
if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
17798+
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
17799+
DAG.getBitcast(MVT::v8i16, Vec),
17800+
DAG.getIntPtrConstant(WordIdx, dl));
17801+
int ShiftVal = (IdxVal % 2) * 8;
17802+
if (ShiftVal != 0)
17803+
Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
17804+
DAG.getConstant(ShiftVal, dl, MVT::i8));
17805+
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
17806+
}
1776817807
}
1776917808

1777017809
if (VT == MVT::f16 || VT.getSizeInBits() == 32) {

llvm/test/CodeGen/X86/bitcast-vector-bool.ll

Lines changed: 28 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -214,23 +214,14 @@ define i1 @trunc_v8i16_cmp(<8 x i16> %a0) nounwind {
214214
}
215215

216216
define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind {
217-
; SSE2-SSSE3-LABEL: bitcast_v16i8_to_v2i8:
218-
; SSE2-SSSE3: # %bb.0:
219-
; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
220-
; SSE2-SSSE3-NEXT: movd %eax, %xmm0
221-
; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
222-
; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
223-
; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
224-
; SSE2-SSSE3-NEXT: retq
225-
;
226-
; SSE41-LABEL: bitcast_v16i8_to_v2i8:
227-
; SSE41: # %bb.0:
228-
; SSE41-NEXT: pmovmskb %xmm0, %ecx
229-
; SSE41-NEXT: movl %ecx, %eax
230-
; SSE41-NEXT: shrl $8, %eax
231-
; SSE41-NEXT: addb %cl, %al
232-
; SSE41-NEXT: # kill: def $al killed $al killed $eax
233-
; SSE41-NEXT: retq
217+
; SSE-LABEL: bitcast_v16i8_to_v2i8:
218+
; SSE: # %bb.0:
219+
; SSE-NEXT: pmovmskb %xmm0, %ecx
220+
; SSE-NEXT: movl %ecx, %eax
221+
; SSE-NEXT: shrl $8, %eax
222+
; SSE-NEXT: addb %cl, %al
223+
; SSE-NEXT: # kill: def $al killed $al killed $eax
224+
; SSE-NEXT: retq
234225
;
235226
; AVX12-LABEL: bitcast_v16i8_to_v2i8:
236227
; AVX12: # %bb.0:
@@ -447,25 +438,15 @@ define i1 @trunc_v8i132_cmp(<8 x i32> %a0) nounwind {
447438
}
448439

449440
define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind {
450-
; SSE2-SSSE3-LABEL: bitcast_v16i16_to_v2i8:
451-
; SSE2-SSSE3: # %bb.0:
452-
; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm0
453-
; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
454-
; SSE2-SSSE3-NEXT: movd %eax, %xmm0
455-
; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
456-
; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
457-
; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
458-
; SSE2-SSSE3-NEXT: retq
459-
;
460-
; SSE41-LABEL: bitcast_v16i16_to_v2i8:
461-
; SSE41: # %bb.0:
462-
; SSE41-NEXT: packsswb %xmm1, %xmm0
463-
; SSE41-NEXT: pmovmskb %xmm0, %ecx
464-
; SSE41-NEXT: movl %ecx, %eax
465-
; SSE41-NEXT: shrl $8, %eax
466-
; SSE41-NEXT: addb %cl, %al
467-
; SSE41-NEXT: # kill: def $al killed $al killed $eax
468-
; SSE41-NEXT: retq
441+
; SSE-LABEL: bitcast_v16i16_to_v2i8:
442+
; SSE: # %bb.0:
443+
; SSE-NEXT: packsswb %xmm1, %xmm0
444+
; SSE-NEXT: pmovmskb %xmm0, %ecx
445+
; SSE-NEXT: movl %ecx, %eax
446+
; SSE-NEXT: shrl $8, %eax
447+
; SSE-NEXT: addb %cl, %al
448+
; SSE-NEXT: # kill: def $al killed $al killed $eax
449+
; SSE-NEXT: retq
469450
;
470451
; AVX1-LABEL: bitcast_v16i16_to_v2i8:
471452
; AVX1: # %bb.0:
@@ -776,29 +757,17 @@ define i1 @trunc_v8i64_cmp(<8 x i64> %a0) nounwind {
776757
}
777758

778759
define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind {
779-
; SSE2-SSSE3-LABEL: bitcast_v16i32_to_v2i8:
780-
; SSE2-SSSE3: # %bb.0:
781-
; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2
782-
; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
783-
; SSE2-SSSE3-NEXT: packsswb %xmm2, %xmm0
784-
; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
785-
; SSE2-SSSE3-NEXT: movd %eax, %xmm0
786-
; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
787-
; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
788-
; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al
789-
; SSE2-SSSE3-NEXT: retq
790-
;
791-
; SSE41-LABEL: bitcast_v16i32_to_v2i8:
792-
; SSE41: # %bb.0:
793-
; SSE41-NEXT: packssdw %xmm3, %xmm2
794-
; SSE41-NEXT: packssdw %xmm1, %xmm0
795-
; SSE41-NEXT: packsswb %xmm2, %xmm0
796-
; SSE41-NEXT: pmovmskb %xmm0, %ecx
797-
; SSE41-NEXT: movl %ecx, %eax
798-
; SSE41-NEXT: shrl $8, %eax
799-
; SSE41-NEXT: addb %cl, %al
800-
; SSE41-NEXT: # kill: def $al killed $al killed $eax
801-
; SSE41-NEXT: retq
760+
; SSE-LABEL: bitcast_v16i32_to_v2i8:
761+
; SSE: # %bb.0:
762+
; SSE-NEXT: packssdw %xmm3, %xmm2
763+
; SSE-NEXT: packssdw %xmm1, %xmm0
764+
; SSE-NEXT: packsswb %xmm2, %xmm0
765+
; SSE-NEXT: pmovmskb %xmm0, %ecx
766+
; SSE-NEXT: movl %ecx, %eax
767+
; SSE-NEXT: shrl $8, %eax
768+
; SSE-NEXT: addb %cl, %al
769+
; SSE-NEXT: # kill: def $al killed $al killed $eax
770+
; SSE-NEXT: retq
802771
;
803772
; AVX1-LABEL: bitcast_v16i32_to_v2i8:
804773
; AVX1: # %bb.0:

llvm/test/CodeGen/X86/pr63108.ll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,10 @@ define i32 @PR63108() {
3434
; SSE-NEXT: psrld $16, %xmm0
3535
; SSE-NEXT: pxor %xmm2, %xmm0
3636
; SSE-NEXT: .LBB0_5: # %for.cond.cleanup
37-
; SSE-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
38-
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ecx
39-
; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
37+
; SSE-NEXT: movd %xmm0, %eax
38+
; SSE-NEXT: movsbl %al, %ecx
39+
; SSE-NEXT: shrl $8, %eax
40+
; SSE-NEXT: movsbl %al, %eax
4041
; SSE-NEXT: addl %ecx, %eax
4142
; SSE-NEXT: retq
4243
;

0 commit comments

Comments
 (0)