
Commit 9d2351a

[X86] matchPMADDWD - add matching for (add X, (pmaddwd Y, Z)) reassociation patterns.

Allows us to match pmaddwd accumulation patterns, and fold to vpdpwssd instructions on VNNI targets. Fixes #118433.
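
For illustration, here is a minimal IR sketch of the accumulation pattern the new reassociation match is aimed at, modeled on the vpdpwssd_v8i32_accumulate test updated below (the function and value names, and the exact position of the accumulator add, are assumptions for this sketch):

; Pairwise products of sign-extended i16 lanes, with the accumulator add
; reassociated between the two pairwise adds -- i.e. the
; (add X, (add Accum, Y)) shape the new sd_match pattern accepts.
define <8 x i32> @pmaddwd_accumulate(<16 x i16> %a0, <16 x i16> %a1, <8 x i32> %acc) {
  %x0 = sext <16 x i16> %a0 to <16 x i32>
  %x1 = sext <16 x i16> %a1 to <16 x i32>
  %m = mul <16 x i32> %x0, %x1
  ; even/odd product lanes, each feeding one side of the pairwise add
  %even = shufflevector <16 x i32> %m, <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %odd = shufflevector <16 x i32> %m, <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ; accumulator folded into the inner add: (add %even, (add %acc, %odd))
  %inner = add <8 x i32> %acc, %odd
  %r = add <8 x i32> %even, %inner
  ret <8 x i32> %r
}

On an AVX512VL+VNNI target this should now lower to a single vpdpwssd (or the {vex}-encoded form on AVX-VNNI-only targets), matching the regenerated CHECK lines below.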
1 parent ce4801d · commit 9d2351a

2 files changed: +55 additions, -107 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 9 additions & 3 deletions
@@ -56471,9 +56471,12 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
       !isPowerOf2_32(VT.getVectorNumElements()))
     return SDValue();
 
-  SDValue Op0, Op1;
+  SDValue Op0, Op1, Accum;
   if (!sd_match(N, m_Add(m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op0)),
-                         m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op1)))))
+                         m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op1)))) &&
+      !sd_match(N, m_Add(m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op0)),
+                         m_Add(m_Value(Accum), m_AllOf(m_Opc(ISD::BUILD_VECTOR),
+                                                       m_Value(Op1))))))
     return SDValue();
 
   // Check if one of Op0,Op1 is of the form:
@@ -56549,7 +56552,10 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
                                  InVT.getVectorNumElements() / 2);
     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
   };
-  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
+  SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
+  if (Accum)
+    R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
+  return R;
 }
 
 // Attempt to turn this pattern into PMADDWD.

llvm/test/CodeGen/X86/vpdpwssd.ll

Lines changed: 46 additions & 104 deletions
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,ZNVER
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,ZNVER
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,ZNVER,AVX512BW-VNNI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,ZNVER,AVX-VNNI
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s --check-prefixes=CHECK,AVX512-VNNI
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+fast-dpwssd | FileCheck %s --check-prefixes=CHECK,AVX512VL-VNNI
 
@@ -16,56 +16,28 @@ define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
 define <16 x i32> @vpdpwssd_v16i32_accumulate(<32 x i16> %a0, <32 x i16> %a1, <16 x i32> %a2) {
 ; ZNVER-LABEL: vpdpwssd_v16i32_accumulate:
 ; ZNVER:       # %bb.0:
-; ZNVER-NEXT:    vpmovsxwd %ymm0, %zmm3
-; ZNVER-NEXT:    vpmovsxwd %ymm1, %zmm4
-; ZNVER-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; ZNVER-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; ZNVER-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; ZNVER-NEXT:    vpmovsxwd %ymm0, %zmm0
-; ZNVER-NEXT:    vpmovsxwd %ymm1, %zmm1
-; ZNVER-NEXT:    vpmulld %zmm4, %zmm3, %zmm3
-; ZNVER-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; ZNVER-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
-; ZNVER-NEXT:    vpermi2d %zmm0, %zmm3, %zmm5
-; ZNVER-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
-; ZNVER-NEXT:    vpaddd %zmm2, %zmm5, %zmm0
-; ZNVER-NEXT:    vpaddd %zmm4, %zmm0, %zmm0
+; ZNVER-NEXT:    vpdpwssd %zmm1, %zmm0, %zmm2
+; ZNVER-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; ZNVER-NEXT:    retq
 ;
 ; AVX512-VNNI-LABEL: vpdpwssd_v16i32_accumulate:
 ; AVX512-VNNI:       # %bb.0:
-; AVX512-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm3
-; AVX512-VNNI-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm4
-; AVX512-VNNI-NEXT:    vpmulld %zmm4, %zmm3, %zmm3
-; AVX512-VNNI-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512-VNNI-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
-; AVX512-VNNI-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512-VNNI-NEXT:    vpermi2d %zmm0, %zmm3, %zmm1
-; AVX512-VNNI-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; AVX512-VNNI-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
-; AVX512-VNNI-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
-; AVX512-VNNI-NEXT:    vpaddd %zmm4, %zmm0, %zmm0
+; AVX512-VNNI-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-VNNI-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512-VNNI-NEXT:    vpmaddwd %ymm3, %ymm4, %ymm3
+; AVX512-VNNI-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX512-VNNI-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-VNNI-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
 ; AVX512-VNNI-NEXT:    retq
 ;
 ; AVX512VL-VNNI-LABEL: vpdpwssd_v16i32_accumulate:
 ; AVX512VL-VNNI:       # %bb.0:
-; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm3
-; AVX512VL-VNNI-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm4
-; AVX512VL-VNNI-NEXT:    vpmulld %zmm4, %zmm3, %zmm3
-; AVX512VL-VNNI-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512VL-VNNI-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
-; AVX512VL-VNNI-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512VL-VNNI-NEXT:    vpermi2d %zmm0, %zmm3, %zmm1
-; AVX512VL-VNNI-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; AVX512VL-VNNI-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
-; AVX512VL-VNNI-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
-; AVX512VL-VNNI-NEXT:    vpaddd %zmm4, %zmm0, %zmm0
+; AVX512VL-VNNI-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-VNNI-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-VNNI-NEXT:    vpmaddwd %ymm3, %ymm4, %ymm3
+; AVX512VL-VNNI-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX512VL-VNNI-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-VNNI-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
 ; AVX512VL-VNNI-NEXT:    retq
   %x0 = sext <32 x i16> %a0 to <32 x i32>
   %x1 = sext <32 x i16> %a1 to <32 x i32>
@@ -78,43 +50,28 @@ define <16 x i32> @vpdpwssd_v16i32_accumulate(<32 x i16> %a0, <32 x i16> %a1, <1
 }
 
 define <8 x i32> @vpdpwssd_v8i32_accumulate(<16 x i16> %a0, <16 x i16> %a1, <8 x i32> %a2) {
-; ZNVER-LABEL: vpdpwssd_v8i32_accumulate:
-; ZNVER:       # %bb.0:
-; ZNVER-NEXT:    vpmovsxwd %ymm0, %zmm0
-; ZNVER-NEXT:    vpmovsxwd %ymm1, %zmm1
-; ZNVER-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
-; ZNVER-NEXT:    vpmovqd %zmm0, %ymm1
-; ZNVER-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; ZNVER-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
-; ZNVER-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; ZNVER-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; ZNVER-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
-; ZNVER-NEXT:    retq
+; AVX512BW-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
+; AVX512BW-VNNI:       # %bb.0:
+; AVX512BW-VNNI-NEXT:    vpdpwssd %ymm1, %ymm0, %ymm2
+; AVX512BW-VNNI-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX512BW-VNNI-NEXT:    retq
+;
+; AVX-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
+; AVX-VNNI:       # %bb.0:
+; AVX-VNNI-NEXT:    {vex} vpdpwssd %ymm1, %ymm0, %ymm2
+; AVX-VNNI-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX-VNNI-NEXT:    retq
 ;
 ; AVX512-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
 ; AVX512-VNNI:       # %bb.0:
-; AVX512-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512-VNNI-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
-; AVX512-VNNI-NEXT:    vpmovqd %zmm0, %ymm1
-; AVX512-VNNI-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512-VNNI-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
-; AVX512-VNNI-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512-VNNI-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
-; AVX512-VNNI-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; AVX512-VNNI-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX512-VNNI-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
 ; AVX512-VNNI-NEXT:    retq
 ;
 ; AVX512VL-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
 ; AVX512VL-VNNI:       # %bb.0:
-; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512VL-VNNI-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512VL-VNNI-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
-; AVX512VL-VNNI-NEXT:    vpmovqd %zmm0, %ymm1
-; AVX512VL-VNNI-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-VNNI-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
-; AVX512VL-VNNI-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VL-VNNI-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
-; AVX512VL-VNNI-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; AVX512VL-VNNI-NEXT:    vpdpwssd %ymm1, %ymm0, %ymm2
+; AVX512VL-VNNI-NEXT:    vmovdqa %ymm2, %ymm0
 ; AVX512VL-VNNI-NEXT:    retq
   %x0 = sext <16 x i16> %a0 to <16 x i32>
   %x1 = sext <16 x i16> %a1 to <16 x i32>
@@ -127,43 +84,28 @@ define <8 x i32> @vpdpwssd_v8i32_accumulate(<16 x i16> %a0, <16 x i16> %a1, <8 x
 }
 
 define <4 x i32> @vpdpwssd_v4i32_accumulate(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
-; ZNVER-LABEL: vpdpwssd_v4i32_accumulate:
-; ZNVER:       # %bb.0:
-; ZNVER-NEXT:    vpmovsxwd %xmm0, %ymm0
-; ZNVER-NEXT:    vpmovsxwd %xmm1, %ymm1
-; ZNVER-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
-; ZNVER-NEXT:    vpmovqd %ymm0, %xmm1
-; ZNVER-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; ZNVER-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
-; ZNVER-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; ZNVER-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; ZNVER-NEXT:    vzeroupper
-; ZNVER-NEXT:    retq
+; AVX512BW-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
+; AVX512BW-VNNI:       # %bb.0:
+; AVX512BW-VNNI-NEXT:    vpdpwssd %xmm1, %xmm0, %xmm2
+; AVX512BW-VNNI-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX512BW-VNNI-NEXT:    retq
+;
+; AVX-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
+; AVX-VNNI:       # %bb.0:
+; AVX-VNNI-NEXT:    {vex} vpdpwssd %xmm1, %xmm0, %xmm2
+; AVX-VNNI-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX-VNNI-NEXT:    retq
 ;
 ; AVX512-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
 ; AVX512-VNNI:       # %bb.0:
-; AVX512-VNNI-NEXT:    vpmovsxwd %xmm0, %ymm0
-; AVX512-VNNI-NEXT:    vpmovsxwd %xmm1, %ymm1
-; AVX512-VNNI-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
-; AVX512-VNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-VNNI-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[0,2],xmm1[0,2]
-; AVX512-VNNI-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX512-VNNI-NEXT:    vpaddd %xmm2, %xmm3, %xmm1
-; AVX512-VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-VNNI-NEXT:    vzeroupper
+; AVX512-VNNI-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
+; AVX512-VNNI-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX512-VNNI-NEXT:    retq
 ;
 ; AVX512VL-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
 ; AVX512VL-VNNI:       # %bb.0:
-; AVX512VL-VNNI-NEXT:    vpmovsxwd %xmm0, %ymm0
-; AVX512VL-VNNI-NEXT:    vpmovsxwd %xmm1, %ymm1
-; AVX512VL-VNNI-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
-; AVX512VL-VNNI-NEXT:    vpmovqd %ymm0, %xmm1
-; AVX512VL-VNNI-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512VL-VNNI-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
-; AVX512VL-VNNI-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX512VL-VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512VL-VNNI-NEXT:    vzeroupper
+; AVX512VL-VNNI-NEXT:    vpdpwssd %xmm1, %xmm0, %xmm2
+; AVX512VL-VNNI-NEXT:    vmovdqa %xmm2, %xmm0
 ; AVX512VL-VNNI-NEXT:    retq
   %x0 = sext <8 x i16> %a0 to <8 x i32>
   %x1 = sext <8 x i16> %a1 to <8 x i32>
