Skip to content

Commit 9f2bd97

Browse files
authored
[X86] combineConcatVectorOps - convert X86ISD::PACKSS/US concatenation to use combineConcatVectorOps recursion (#130575)
Only concatenate X86ISD::PACKSS/US nodes if at least one operand is beneficial to concatenate
1 parent: c4280db; commit: 9f2bd97

File tree

3 files changed

25 additions (+25) and 41 deletions (-41) in total

3 files changed

+25
-41
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -58481,9 +58481,13 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
5848158481
MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
5848258482
SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
5848358483
NumOps * SrcVT.getVectorNumElements());
58484-
return DAG.getNode(Op0.getOpcode(), DL, VT,
58485-
ConcatSubOperand(SrcVT, Ops, 0),
58486-
ConcatSubOperand(SrcVT, Ops, 1));
58484+
SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
58485+
SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
58486+
if (Concat0 || Concat1)
58487+
return DAG.getNode(
58488+
Op0.getOpcode(), DL, VT,
58489+
Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
58490+
Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
5848758491
}
5848858492
break;
5848958493
case X86ISD::PALIGNR:

llvm/test/CodeGen/X86/vector-pack-512.ll

Lines changed: 12 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -245,21 +245,12 @@ define <64 x i8> @concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) noun
245245
}
246246

247247
define <32 x i16> @concat_packsswd_int_2x256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
248-
; AVX512F-LABEL: concat_packsswd_int_2x256:
249-
; AVX512F: # %bb.0:
250-
; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
251-
; AVX512F-NEXT: vpackssdw %ymm3, %ymm2, %ymm1
252-
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
253-
; AVX512F-NEXT: retq
254-
;
255-
; AVX512BW-LABEL: concat_packsswd_int_2x256:
256-
; AVX512BW: # %bb.0:
257-
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
258-
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
259-
; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
260-
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
261-
; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
262-
; AVX512BW-NEXT: retq
248+
; AVX512-LABEL: concat_packsswd_int_2x256:
249+
; AVX512: # %bb.0:
250+
; AVX512-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
251+
; AVX512-NEXT: vpackssdw %ymm3, %ymm2, %ymm1
252+
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
253+
; AVX512-NEXT: retq
263254
%lo = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
264255
%hi = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a2, <8 x i32> %a3)
265256
%res = shufflevector <16 x i16> %lo, <16 x i16> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -268,21 +259,12 @@ define <32 x i16> @concat_packsswd_int_2x256(<8 x i32> %a0, <8 x i32> %a1, <8 x
268259
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>)
269260

270261
define <32 x i16> @concat_packuswd_int_2x256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
271-
; AVX512F-LABEL: concat_packuswd_int_2x256:
272-
; AVX512F: # %bb.0:
273-
; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
274-
; AVX512F-NEXT: vpackusdw %ymm3, %ymm2, %ymm1
275-
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
276-
; AVX512F-NEXT: retq
277-
;
278-
; AVX512BW-LABEL: concat_packuswd_int_2x256:
279-
; AVX512BW: # %bb.0:
280-
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
281-
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
282-
; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
283-
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
284-
; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
285-
; AVX512BW-NEXT: retq
262+
; AVX512-LABEL: concat_packuswd_int_2x256:
263+
; AVX512: # %bb.0:
264+
; AVX512-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
265+
; AVX512-NEXT: vpackusdw %ymm3, %ymm2, %ymm1
266+
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
267+
; AVX512-NEXT: retq
286268
%lo = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
287269
%hi = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a2, <8 x i32> %a3)
288270
%res = shufflevector <16 x i16> %lo, <16 x i16> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>

llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -817,15 +817,13 @@ define <32 x i8> @concat_alignr_unnecessary(<16 x i8> %a0, <16 x i8> noundef %a1
817817
ret <32 x i8> %res
818818
}
819819

820-
; TODO: Not beneficial to concatenate both inputs just to create a 256-bit packss
821-
define <32 x i8> @concat_packsr_unnecessary(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) nounwind {
822-
; CHECK-LABEL: concat_packsr_unnecessary:
820+
; Not beneficial to concatenate both inputs just to create a 256-bit packss
821+
define <32 x i8> @concat_packss_unnecessary(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) nounwind {
822+
; CHECK-LABEL: concat_packss_unnecessary:
823823
; CHECK: # %bb.0:
824-
; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
825-
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
826-
; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
827-
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
828-
; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
824+
; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm1
825+
; CHECK-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
826+
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
829827
; CHECK-NEXT: ret{{[l|q]}}
830828
%lo = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
831829
%hi = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a2)

0 commit comments

Comments
 (0)