Skip to content
This repository was archived by the owner on Mar 28, 2020. It is now read-only.

Commit 196a560

Browse files
committed
[X86] Add 128-bit and 256-bit VPOPCNTDQ instructions. Adjust some tablegen classes for LZCNT/POPCNT.
I think when this instruction was first published it was only for a Knights CPU, and thus the VLX version was missing. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@320910 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 311afa8 commit 196a560

File tree

5 files changed

+324
-64
lines changed

5 files changed

+324
-64
lines changed

lib/Target/X86/X86InstrAVX512.td

Lines changed: 33 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -9563,82 +9563,50 @@ let Predicates = [HasAVX512, NoVLX] in {
95639563
sub_xmm)>;
95649564
}
95659565

9566-
multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, OpndItins itins,
9567-
Predicate prd> {
9568-
defm NAME : avx512_unary_rm_vl_dq<opc, opc, OpcodeStr, ctlz, itins, prd>;
9566+
// Use 512bit version to implement 128/256 bit.
9567+
multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode,
9568+
AVX512VLVectorVTInfo _, Predicate prd> {
9569+
let Predicates = [prd, NoVLX] in {
9570+
def : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)),
9571+
(EXTRACT_SUBREG
9572+
(!cast<Instruction>(InstrStr # "Zrr")
9573+
(INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
9574+
_.info256.RC:$src1,
9575+
_.info256.SubRegIdx)),
9576+
_.info256.SubRegIdx)>;
9577+
9578+
def : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)),
9579+
(EXTRACT_SUBREG
9580+
(!cast<Instruction>(InstrStr # "Zrr")
9581+
(INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
9582+
_.info128.RC:$src1,
9583+
_.info128.SubRegIdx)),
9584+
_.info128.SubRegIdx)>;
9585+
}
95699586
}
95709587

95719588
// FIXME: Is there a better scheduler itinerary for VPLZCNT?
9572-
defm VPLZCNT : avx512_ctlz<0x44, "vplzcnt", SSE_INTALU_ITINS_P, HasCDI>;
9589+
defm VPLZCNT : avx512_unary_rm_vl_dq<0x44, 0x44, "vplzcnt", ctlz,
9590+
SSE_INTALU_ITINS_P, HasCDI>;
95739591

95749592
// FIXME: Is there a better scheduler itinerary for VPCONFLICT?
95759593
defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict,
95769594
SSE_INTALU_ITINS_P, HasCDI>;
95779595

95789596
// VPLZCNT: Use 512bit version to implement 128/256 bit in case NoVLX.
9579-
let Predicates = [HasCDI, NoVLX] in {
9580-
def : Pat<(v4i64 (ctlz VR256X:$src)),
9581-
(EXTRACT_SUBREG
9582-
(VPLZCNTQZrr
9583-
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)),
9584-
sub_ymm)>;
9585-
def : Pat<(v2i64 (ctlz VR128X:$src)),
9586-
(EXTRACT_SUBREG
9587-
(VPLZCNTQZrr
9588-
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)),
9589-
sub_xmm)>;
9590-
9591-
def : Pat<(v8i32 (ctlz VR256X:$src)),
9592-
(EXTRACT_SUBREG
9593-
(VPLZCNTDZrr
9594-
(INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)),
9595-
sub_ymm)>;
9596-
def : Pat<(v4i32 (ctlz VR128X:$src)),
9597-
(EXTRACT_SUBREG
9598-
(VPLZCNTDZrr
9599-
(INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)),
9600-
sub_xmm)>;
9601-
}
9597+
defm : avx512_unary_lowering<"VPLZCNTQ", ctlz, avx512vl_i64_info, HasCDI>;
9598+
defm : avx512_unary_lowering<"VPLZCNTD", ctlz, avx512vl_i32_info, HasCDI>;
96029599

96039600
//===---------------------------------------------------------------------===//
96049601
// Counts number of ones - VPOPCNTD and VPOPCNTQ
96059602
//===---------------------------------------------------------------------===//
96069603

9607-
multiclass avx512_unary_rmb_popcnt<bits<8> opc, string OpcodeStr,
9608-
OpndItins itins, X86VectorVTInfo VTInfo> {
9609-
let Predicates = [HasVPOPCNTDQ] in
9610-
defm Z : avx512_unary_rmb<opc, OpcodeStr, ctpop, itins, VTInfo>, EVEX_V512;
9611-
}
9612-
9613-
// Use 512bit version to implement 128/256 bit.
9614-
multiclass avx512_unary_lowering<SDNode OpNode, AVX512VLVectorVTInfo _, Predicate prd> {
9615-
let Predicates = [prd] in {
9616-
def Z256_Alt : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)),
9617-
(EXTRACT_SUBREG
9618-
(!cast<Instruction>(NAME # "Zrr")
9619-
(INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
9620-
_.info256.RC:$src1,
9621-
_.info256.SubRegIdx)),
9622-
_.info256.SubRegIdx)>;
9623-
9624-
def Z128_Alt : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)),
9625-
(EXTRACT_SUBREG
9626-
(!cast<Instruction>(NAME # "Zrr")
9627-
(INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
9628-
_.info128.RC:$src1,
9629-
_.info128.SubRegIdx)),
9630-
_.info128.SubRegIdx)>;
9631-
}
9632-
}
9633-
96349604
// FIXME: Is there a better scheduler itinerary for VPOPCNTD/VPOPCNTQ?
9635-
defm VPOPCNTD : avx512_unary_rmb_popcnt<0x55, "vpopcntd", SSE_INTALU_ITINS_P,
9636-
v16i32_info>,
9637-
avx512_unary_lowering<ctpop, avx512vl_i32_info, HasVPOPCNTDQ>;
9605+
defm VPOPCNT : avx512_unary_rm_vl_dq<0x55, 0x55, "vpopcnt", ctpop,
9606+
SSE_INTALU_ITINS_P, HasVPOPCNTDQ>;
96389607

9639-
defm VPOPCNTQ : avx512_unary_rmb_popcnt<0x55, "vpopcntq", SSE_INTALU_ITINS_P,
9640-
v8i64_info>,
9641-
avx512_unary_lowering<ctpop, avx512vl_i64_info, HasVPOPCNTDQ>, VEX_W;
9608+
defm : avx512_unary_lowering<"VPOPCNTQ", ctpop, avx512vl_i64_info, HasVPOPCNTDQ>;
9609+
defm : avx512_unary_lowering<"VPOPCNTD", ctpop, avx512vl_i32_info, HasVPOPCNTDQ>;
96429610

96439611
//===---------------------------------------------------------------------===//
96449612
// Replicate Single FP - MOVSHDUP and MOVSLDUP
@@ -10631,11 +10599,12 @@ defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SSE_PMADD>;
1063110599

1063210600
// FIXME: Is there a better scheduler itinerary for VPOPCNTB/VPOPCNTW?
1063310601
defm VPOPCNTB : avx512_unary_rm_vl<0x54, "vpopcntb", ctpop, SSE_INTALU_ITINS_P,
10634-
avx512vl_i8_info, HasBITALG>,
10635-
avx512_unary_lowering<ctpop, avx512vl_i8_info, HasBITALG>;
10602+
avx512vl_i8_info, HasBITALG>;
1063610603
defm VPOPCNTW : avx512_unary_rm_vl<0x54, "vpopcntw", ctpop, SSE_INTALU_ITINS_P,
10637-
avx512vl_i16_info, HasBITALG>,
10638-
avx512_unary_lowering<ctpop, avx512vl_i16_info, HasBITALG>, VEX_W;
10604+
avx512vl_i16_info, HasBITALG>, VEX_W;
10605+
10606+
defm : avx512_unary_lowering<"VPOPCNTB", ctpop, avx512vl_i8_info, HasBITALG>;
10607+
defm : avx512_unary_lowering<"VPOPCNTW", ctpop, avx512vl_i16_info, HasBITALG>;
1063910608

1064010609
multiclass VPSHUFBITQMB_rm<OpndItins itins, X86VectorVTInfo VTI> {
1064110610
defm rr : AVX512_maskable_cmp<0x8F, MRMSrcReg, VTI, (outs VTI.KRC:$dst),

test/CodeGen/X86/vector-popcnt-128.ll

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
77
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
88
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
9+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQVL
910
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=ALL --check-prefix=BITALG_NOVLX
1011
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=BITALG
1112

@@ -120,6 +121,11 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
120121
; AVX512VPOPCNTDQ-NEXT: vzeroupper
121122
; AVX512VPOPCNTDQ-NEXT: retq
122123
;
124+
; AVX512VPOPCNTDQVL-LABEL: testv2i64:
125+
; AVX512VPOPCNTDQVL: # %bb.0:
126+
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
127+
; AVX512VPOPCNTDQVL-NEXT: retq
128+
;
123129
; BITALG_NOVLX-LABEL: testv2i64:
124130
; BITALG_NOVLX: # %bb.0:
125131
; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -290,6 +296,11 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
290296
; AVX512VPOPCNTDQ-NEXT: vzeroupper
291297
; AVX512VPOPCNTDQ-NEXT: retq
292298
;
299+
; AVX512VPOPCNTDQVL-LABEL: testv4i32:
300+
; AVX512VPOPCNTDQVL: # %bb.0:
301+
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0
302+
; AVX512VPOPCNTDQVL-NEXT: retq
303+
;
293304
; BITALG_NOVLX-LABEL: testv4i32:
294305
; BITALG_NOVLX: # %bb.0:
295306
; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -448,6 +459,14 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
448459
; AVX512VPOPCNTDQ-NEXT: vzeroupper
449460
; AVX512VPOPCNTDQ-NEXT: retq
450461
;
462+
; AVX512VPOPCNTDQVL-LABEL: testv8i16:
463+
; AVX512VPOPCNTDQVL: # %bb.0:
464+
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
465+
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %zmm0, %zmm0
466+
; AVX512VPOPCNTDQVL-NEXT: vpmovqw %zmm0, %xmm0
467+
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
468+
; AVX512VPOPCNTDQVL-NEXT: retq
469+
;
451470
; BITALG_NOVLX-LABEL: testv8i16:
452471
; BITALG_NOVLX: # %bb.0:
453472
; BITALG_NOVLX-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
@@ -565,6 +584,14 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
565584
; AVX512VPOPCNTDQ-NEXT: vzeroupper
566585
; AVX512VPOPCNTDQ-NEXT: retq
567586
;
587+
; AVX512VPOPCNTDQVL-LABEL: testv16i8:
588+
; AVX512VPOPCNTDQVL: # %bb.0:
589+
; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
590+
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
591+
; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0
592+
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
593+
; AVX512VPOPCNTDQVL-NEXT: retq
594+
;
568595
; BITALG_NOVLX-LABEL: testv16i8:
569596
; BITALG_NOVLX: # %bb.0:
570597
; BITALG_NOVLX-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0

test/CodeGen/X86/vector-popcnt-256.ll

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
33
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
44
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
5+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQVL
56
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=BITALG_NOVLX
67
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=BITALG
78

@@ -50,6 +51,11 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
5051
; AVX512VPOPCNTDQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
5152
; AVX512VPOPCNTDQ-NEXT: retq
5253
;
54+
; AVX512VPOPCNTDQVL-LABEL: testv4i64:
55+
; AVX512VPOPCNTDQVL: # %bb.0:
56+
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0
57+
; AVX512VPOPCNTDQVL-NEXT: retq
58+
;
5359
; BITALG_NOVLX-LABEL: testv4i64:
5460
; BITALG_NOVLX: # %bb.0:
5561
; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -138,6 +144,11 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
138144
; AVX512VPOPCNTDQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
139145
; AVX512VPOPCNTDQ-NEXT: retq
140146
;
147+
; AVX512VPOPCNTDQVL-LABEL: testv8i32:
148+
; AVX512VPOPCNTDQVL: # %bb.0:
149+
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
150+
; AVX512VPOPCNTDQVL-NEXT: retq
151+
;
141152
; BITALG_NOVLX-LABEL: testv8i32:
142153
; BITALG_NOVLX: # %bb.0:
143154
; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -226,6 +237,13 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
226237
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
227238
; AVX512VPOPCNTDQ-NEXT: retq
228239
;
240+
; AVX512VPOPCNTDQVL-LABEL: testv16i16:
241+
; AVX512VPOPCNTDQVL: # %bb.0:
242+
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
243+
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
244+
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
245+
; AVX512VPOPCNTDQVL-NEXT: retq
246+
;
229247
; BITALG_NOVLX-LABEL: testv16i16:
230248
; BITALG_NOVLX: # %bb.0:
231249
; BITALG_NOVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
@@ -286,6 +304,18 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
286304
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
287305
; AVX512VPOPCNTDQ-NEXT: retq
288306
;
307+
; AVX512VPOPCNTDQVL-LABEL: testv32i8:
308+
; AVX512VPOPCNTDQVL: # %bb.0:
309+
; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
310+
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
311+
; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
312+
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
313+
; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
314+
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
315+
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
316+
; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
317+
; AVX512VPOPCNTDQVL-NEXT: retq
318+
;
289319
; BITALG_NOVLX-LABEL: testv32i8:
290320
; BITALG_NOVLX: # %bb.0:
291321
; BITALG_NOVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0

0 commit comments

Comments
 (0)