Commit 5d2ece1

[X86] Allow znver3/4/5 targets to use double-shift instructions by default (#132720)
While double-shift instructions are still not as fast as on Intel targets, on recent AMD CPUs (znver3 and later) they are no longer as heavily microcoded and bottlenecked as on earlier AMD targets (now only ~2cy reciprocal throughput), which beats the 3*shift+not+or sequence we otherwise expand to as an alternative. Noticed while triaging #132601
1 parent 3a3d1bf commit 5d2ece1
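For context, the "double shift" in question merges two 64-bit values under one shift count, which LLVM IR expresses with the funnel-shift intrinsics. A minimal sketch (illustrative function name, not part of the commit) of IR that can now lower to a single shldq on znver3 and later instead of the shift+shift+not+or expansion:

```llvm
; Sketch only: fshl(%hi, %lo, %amt) yields (%hi << %amt) | (%lo >> (64 - %amt))
; with the count taken modulo 64 -- exactly the semantics of x86 shldq.
define i64 @double_shift_left(i64 %hi, i64 %lo, i64 %amt) {
  %r = call i64 @llvm.fshl.i64(i64 %hi, i64 %lo, i64 %amt)
  ret i64 %r
}
declare i64 @llvm.fshl.i64(i64, i64, i64)
```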

File tree

3 files changed: +81 -179 lines

llvm/lib/Target/X86/X86.td

Lines changed: 1 addition & 1 deletion
@@ -1574,7 +1574,7 @@ def ProcessorFeatures {
                                                  FeatureVPCLMULQDQ];
   list<SubtargetFeature> ZN3AdditionalTuning = [TuningMacroFusion];
   list<SubtargetFeature> ZN3Tuning =
-    !listconcat(ZN2Tuning, ZN3AdditionalTuning);
+    !listremove(!listconcat(ZN2Tuning, ZN3AdditionalTuning), [TuningSlowSHLD]);
   list<SubtargetFeature> ZN3Features =
     !listconcat(ZN2Features, ZN3AdditionalFeatures);
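Because the znver4/znver5 tuning lists build on ZN3Tuning, removing TuningSlowSHLD here covers all three CPUs named in the commit title. A quick way to observe the flip is a FileCheck probe in the style of the tests below (hypothetical file, not part of the commit):

```llvm
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefix=SLOW
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefix=FAST
; FAST: shldq
; SLOW-NOT: shldq
; With TuningSlowSHLD dropped, znver3 should select shldq for a variable
; i64 funnel shift, while znver2 keeps the expanded sequence (assumption
; based on the test updates in this commit).
define i64 @probe(i64 %hi, i64 %lo, i64 %amt) {
  %r = call i64 @llvm.fshl.i64(i64 %hi, i64 %lo, i64 %amt)
  ret i64 %r
}
declare i64 @llvm.fshl.i64(i64, i64, i64)
```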

llvm/test/CodeGen/X86/shift-i512.ll

Lines changed: 40 additions & 154 deletions
@@ -48,46 +48,20 @@ define <8 x i64> @shl_i512_1(<8 x i64> %a) {
 ;
 ; ZNVER4-LABEL: shl_i512_1:
 ; ZNVER4:       # %bb.0:
-; ZNVER4-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; ZNVER4-NEXT:    vmovq %xmm0, %rdx
-; ZNVER4-NEXT:    vpextrq $1, %xmm0, %r9
-; ZNVER4-NEXT:    vpextrq $1, %xmm1, %rax
-; ZNVER4-NEXT:    vmovq %xmm1, %rcx
 ; ZNVER4-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; ZNVER4-NEXT:    shrq $63, %rdx
-; ZNVER4-NEXT:    vpextrq $1, %xmm1, %rsi
-; ZNVER4-NEXT:    vmovq %xmm1, %rdi
-; ZNVER4-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; ZNVER4-NEXT:    leaq (%rdx,%r9,2), %rdx
-; ZNVER4-NEXT:    shrq $63, %r9
-; ZNVER4-NEXT:    vpsllq $1, %xmm0, %xmm0
-; ZNVER4-NEXT:    vmovq %xmm1, %r10
-; ZNVER4-NEXT:    vpextrq $1, %xmm1, %r8
-; ZNVER4-NEXT:    leaq (%r9,%r10,2), %r9
-; ZNVER4-NEXT:    shrq $63, %r10
-; ZNVER4-NEXT:    vmovq %rdx, %xmm4
-; ZNVER4-NEXT:    leaq (%r10,%r8,2), %r10
-; ZNVER4-NEXT:    shrq $63, %r8
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; ZNVER4-NEXT:    leaq (%r8,%rdi,2), %r8
-; ZNVER4-NEXT:    shrq $63, %rdi
-; ZNVER4-NEXT:    leaq (%rdi,%rsi,2), %rdi
-; ZNVER4-NEXT:    shrq $63, %rsi
-; ZNVER4-NEXT:    leaq (%rsi,%rcx,2), %rsi
-; ZNVER4-NEXT:    shrq $63, %rcx
-; ZNVER4-NEXT:    vmovq %r8, %xmm3
-; ZNVER4-NEXT:    leaq (%rcx,%rax,2), %rax
-; ZNVER4-NEXT:    vmovq %rsi, %xmm2
-; ZNVER4-NEXT:    vmovq %rax, %xmm1
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; ZNVER4-NEXT:    vmovq %rdi, %xmm2
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; ZNVER4-NEXT:    vmovq %r10, %xmm3
+; ZNVER4-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; ZNVER4-NEXT:    vpsllq $1, %xmm0, %xmm4
 ; ZNVER4-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ZNVER4-NEXT:    vmovq %r9, %xmm2
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ZNVER4-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; ZNVER4-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ZNVER4-NEXT:    vpshldq $1, %xmm3, %xmm2, %xmm3
+; ZNVER4-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; ZNVER4-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
+; ZNVER4-NEXT:    vpshldq $1, %ymm1, %ymm2, %ymm1
+; ZNVER4-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; ZNVER4-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; ZNVER4-NEXT:    vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; ZNVER4-NEXT:    vpshldq $1, %zmm0, %zmm3, %zmm0
+; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6]
 ; ZNVER4-NEXT:    retq
   %d = bitcast <8 x i64> %a to i512
   %s = shl i512 %d, 1
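The updated ZNVER4 sequence uses VPSHLDQ, the AVX512-VBMI2 per-lane concat-and-shift, in place of the scalar extract/lea/shr chain. A standalone sketch of the underlying per-lane operation (illustrative, not from the test file):

```llvm
; Sketch: each i64 lane computes fshl(a, b, 1) = (a << 1) | (b >> 63);
; VBMI2-capable targets (e.g. -mcpu=znver4) can lower this to vpshldq $1.
define <8 x i64> @funnel_shl1_v8i64(<8 x i64> %a, <8 x i64> %b) {
  %r = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a, <8 x i64> %b,
            <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>)
  ret <8 x i64> %r
}
declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
```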
@@ -142,65 +116,21 @@ define <8 x i64> @lshr_i512_1(<8 x i64> %a) {
 ;
 ; ZNVER4-LABEL: lshr_i512_1:
 ; ZNVER4:       # %bb.0:
-; ZNVER4-NEXT:    pushq %rbx
-; ZNVER4-NEXT:    .cfi_def_cfa_offset 16
-; ZNVER4-NEXT:    .cfi_offset %rbx, -16
+; ZNVER4-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; ZNVER4-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; ZNVER4-NEXT:    vmovq %xmm0, %r10
-; ZNVER4-NEXT:    vpextrq $1, %xmm0, %rsi
-; ZNVER4-NEXT:    vpextrq $1, %xmm1, %rcx
-; ZNVER4-NEXT:    vmovq %xmm1, %r9
-; ZNVER4-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; ZNVER4-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; ZNVER4-NEXT:    shrq %r10
-; ZNVER4-NEXT:    vpextrq $1, %xmm0, %rax
-; ZNVER4-NEXT:    vmovq %xmm0, %rdx
-; ZNVER4-NEXT:    vmovq %xmm1, %rdi
-; ZNVER4-NEXT:    vpextrq $1, %xmm1, %r11
-; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; ZNVER4-NEXT:    movq %rdx, %r8
-; ZNVER4-NEXT:    shrq %r8
-; ZNVER4-NEXT:    shlq $63, %rax
-; ZNVER4-NEXT:    movq %rdi, %rbx
-; ZNVER4-NEXT:    shrq %rbx
-; ZNVER4-NEXT:    shlq $63, %rdx
-; ZNVER4-NEXT:    shlq $63, %rdi
-; ZNVER4-NEXT:    vpsrlq $1, %xmm0, %xmm0
-; ZNVER4-NEXT:    orq %r8, %rax
-; ZNVER4-NEXT:    movq %r11, %r8
-; ZNVER4-NEXT:    shlq $63, %r8
-; ZNVER4-NEXT:    shrq %r11
-; ZNVER4-NEXT:    orq %rbx, %r8
-; ZNVER4-NEXT:    movq %r9, %rbx
-; ZNVER4-NEXT:    orq %r11, %rdx
-; ZNVER4-NEXT:    movq %rsi, %r11
-; ZNVER4-NEXT:    shrq %r11
-; ZNVER4-NEXT:    shlq $63, %rbx
-; ZNVER4-NEXT:    shrq %r9
-; ZNVER4-NEXT:    shlq $63, %rsi
-; ZNVER4-NEXT:    vmovq %rax, %xmm4
-; ZNVER4-NEXT:    orq %r11, %rbx
-; ZNVER4-NEXT:    movq %rcx, %r11
-; ZNVER4-NEXT:    shlq $63, %r11
-; ZNVER4-NEXT:    shrq %rcx
-; ZNVER4-NEXT:    orq %r10, %rsi
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
-; ZNVER4-NEXT:    orq %r9, %r11
-; ZNVER4-NEXT:    orq %rdi, %rcx
-; ZNVER4-NEXT:    vmovq %rbx, %xmm3
-; ZNVER4-NEXT:    vmovq %rcx, %xmm1
-; ZNVER4-NEXT:    vmovq %r11, %xmm2
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; ZNVER4-NEXT:    vmovq %rsi, %xmm2
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ZNVER4-NEXT:    vmovq %r8, %xmm3
-; ZNVER4-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ZNVER4-NEXT:    vmovq %rdx, %xmm2
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; ZNVER4-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
-; ZNVER4-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; ZNVER4-NEXT:    popq %rbx
-; ZNVER4-NEXT:    .cfi_def_cfa_offset 8
+; ZNVER4-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; ZNVER4-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; ZNVER4-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
+; ZNVER4-NEXT:    vpshldq $63, %xmm4, %xmm2, %xmm4
+; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; ZNVER4-NEXT:    vpshldq $63, %ymm3, %ymm1, %ymm1
+; ZNVER4-NEXT:    vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; ZNVER4-NEXT:    vpsrlq $1, %xmm2, %xmm2
+; ZNVER4-NEXT:    vpshldq $63, %zmm0, %zmm3, %zmm0
+; ZNVER4-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
+; ZNVER4-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; ZNVER4-NEXT:    retq
   %d = bitcast <8 x i64> %a to i512
   %s = lshr i512 %d, 1
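For the right shift the sequence uses vpshldq $63: a left concat-shift by 63 with swapped operands produces the same per-lane value as a funnel shift right by 1. In IR terms (illustrative sketch, not from the test file):

```llvm
; Sketch: per i64 lane, fshr(a, b, 1) = (a << 63) | (b >> 1), and
; fshl(a, b, 63) computes the identical value, so vpshldq $63 applies.
define <8 x i64> @funnel_lshr1_v8i64(<8 x i64> %a, <8 x i64> %b) {
  %r = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a, <8 x i64> %b,
            <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>)
  ret <8 x i64> %r
}
declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
```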
@@ -255,65 +185,21 @@ define <8 x i64> @ashr_i512_1(<8 x i64> %a) {
 ;
 ; ZNVER4-LABEL: ashr_i512_1:
 ; ZNVER4:       # %bb.0:
-; ZNVER4-NEXT:    pushq %rbx
-; ZNVER4-NEXT:    .cfi_def_cfa_offset 16
-; ZNVER4-NEXT:    .cfi_offset %rbx, -16
+; ZNVER4-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; ZNVER4-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; ZNVER4-NEXT:    vmovq %xmm0, %r10
-; ZNVER4-NEXT:    vpextrq $1, %xmm0, %rsi
-; ZNVER4-NEXT:    vpextrq $1, %xmm1, %rcx
-; ZNVER4-NEXT:    vmovq %xmm1, %r9
-; ZNVER4-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; ZNVER4-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; ZNVER4-NEXT:    shrq %r10
-; ZNVER4-NEXT:    vpextrq $1, %xmm0, %rax
-; ZNVER4-NEXT:    vmovq %xmm0, %rdx
-; ZNVER4-NEXT:    vmovq %xmm1, %rdi
-; ZNVER4-NEXT:    vpextrq $1, %xmm1, %r11
-; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; ZNVER4-NEXT:    movq %rdx, %r8
-; ZNVER4-NEXT:    shrq %r8
-; ZNVER4-NEXT:    shlq $63, %rax
-; ZNVER4-NEXT:    movq %rdi, %rbx
-; ZNVER4-NEXT:    shrq %rbx
-; ZNVER4-NEXT:    shlq $63, %rdx
-; ZNVER4-NEXT:    shlq $63, %rdi
-; ZNVER4-NEXT:    vpsraq $1, %xmm0, %xmm0
-; ZNVER4-NEXT:    orq %r8, %rax
-; ZNVER4-NEXT:    movq %r11, %r8
-; ZNVER4-NEXT:    shlq $63, %r8
-; ZNVER4-NEXT:    shrq %r11
-; ZNVER4-NEXT:    orq %rbx, %r8
-; ZNVER4-NEXT:    movq %r9, %rbx
-; ZNVER4-NEXT:    orq %r11, %rdx
-; ZNVER4-NEXT:    movq %rsi, %r11
-; ZNVER4-NEXT:    shrq %r11
-; ZNVER4-NEXT:    shlq $63, %rbx
-; ZNVER4-NEXT:    shrq %r9
-; ZNVER4-NEXT:    shlq $63, %rsi
-; ZNVER4-NEXT:    vmovq %rax, %xmm4
-; ZNVER4-NEXT:    orq %r11, %rbx
-; ZNVER4-NEXT:    movq %rcx, %r11
-; ZNVER4-NEXT:    shlq $63, %r11
-; ZNVER4-NEXT:    shrq %rcx
-; ZNVER4-NEXT:    orq %r10, %rsi
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
-; ZNVER4-NEXT:    orq %r9, %r11
-; ZNVER4-NEXT:    orq %rdi, %rcx
-; ZNVER4-NEXT:    vmovq %rbx, %xmm3
-; ZNVER4-NEXT:    vmovq %rcx, %xmm1
-; ZNVER4-NEXT:    vmovq %r11, %xmm2
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; ZNVER4-NEXT:    vmovq %rsi, %xmm2
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ZNVER4-NEXT:    vmovq %r8, %xmm3
-; ZNVER4-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ZNVER4-NEXT:    vmovq %rdx, %xmm2
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; ZNVER4-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
-; ZNVER4-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; ZNVER4-NEXT:    popq %rbx
-; ZNVER4-NEXT:    .cfi_def_cfa_offset 8
+; ZNVER4-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; ZNVER4-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; ZNVER4-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
+; ZNVER4-NEXT:    vpshldq $63, %xmm4, %xmm2, %xmm4
+; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; ZNVER4-NEXT:    vpshldq $63, %ymm3, %ymm1, %ymm1
+; ZNVER4-NEXT:    vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; ZNVER4-NEXT:    vpsraq $1, %xmm2, %xmm2
+; ZNVER4-NEXT:    vpshldq $63, %zmm0, %zmm3, %zmm0
+; ZNVER4-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
+; ZNVER4-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; ZNVER4-NEXT:    retq
   %d = bitcast <8 x i64> %a to i512
   %s = ashr i512 %d, 1

llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll

Lines changed: 40 additions & 24 deletions
@@ -12,12 +12,12 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver1 | FileCheck %s --check-prefixes=BMI
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s --check-prefixes=BMI
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver3 | FileCheck %s --check-prefixes=BMI
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s --check-prefixes=BMI2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=BMI2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=BMI2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=BMI2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=BMI2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 | FileCheck %s --check-prefixes=BMI2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s --check-prefixes=BMI2-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=BMI2-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=BMI2-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=BMI2-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=BMI2-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 | FileCheck %s --check-prefixes=BMI2-FAST
 
 ; Verify that for the X86_64 processors that are known to have poor latency
 ; double precision shift instructions we do not generate 'shld' or 'shrd'
@@ -53,15 +53,23 @@ define i64 @lshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
 ; BMI-NEXT:    orq %rdi, %rax
 ; BMI-NEXT:    retq
 ;
-; BMI2-LABEL: lshift:
-; BMI2:       # %bb.0: # %entry
-; BMI2-NEXT:    # kill: def $edx killed $edx def $rdx
-; BMI2-NEXT:    shlxq %rdx, %rdi, %rcx
-; BMI2-NEXT:    notb %dl
-; BMI2-NEXT:    shrq %rsi
-; BMI2-NEXT:    shrxq %rdx, %rsi, %rax
-; BMI2-NEXT:    orq %rcx, %rax
-; BMI2-NEXT:    retq
+; BMI2-SLOW-LABEL: lshift:
+; BMI2-SLOW:       # %bb.0: # %entry
+; BMI2-SLOW-NEXT:    # kill: def $edx killed $edx def $rdx
+; BMI2-SLOW-NEXT:    shlxq %rdx, %rdi, %rcx
+; BMI2-SLOW-NEXT:    notb %dl
+; BMI2-SLOW-NEXT:    shrq %rsi
+; BMI2-SLOW-NEXT:    shrxq %rdx, %rsi, %rax
+; BMI2-SLOW-NEXT:    orq %rcx, %rax
+; BMI2-SLOW-NEXT:    retq
+;
+; BMI2-FAST-LABEL: lshift:
+; BMI2-FAST:       # %bb.0: # %entry
+; BMI2-FAST-NEXT:    movl %edx, %ecx
+; BMI2-FAST-NEXT:    movq %rdi, %rax
+; BMI2-FAST-NEXT:    # kill: def $cl killed $cl killed $ecx
+; BMI2-FAST-NEXT:    shldq %cl, %rsi, %rax
+; BMI2-FAST-NEXT:    retq
 entry:
   %sh_prom = zext i32 %c to i64
   %shl = shl i64 %a, %sh_prom
@@ -100,15 +108,23 @@ define i64 @rshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
 ; BMI-NEXT:    orq %rdi, %rax
 ; BMI-NEXT:    retq
 ;
-; BMI2-LABEL: rshift:
-; BMI2:       # %bb.0: # %entry
-; BMI2-NEXT:    # kill: def $edx killed $edx def $rdx
-; BMI2-NEXT:    shrxq %rdx, %rdi, %rcx
-; BMI2-NEXT:    notb %dl
-; BMI2-NEXT:    addq %rsi, %rsi
-; BMI2-NEXT:    shlxq %rdx, %rsi, %rax
-; BMI2-NEXT:    orq %rcx, %rax
-; BMI2-NEXT:    retq
+; BMI2-SLOW-LABEL: rshift:
+; BMI2-SLOW:       # %bb.0: # %entry
+; BMI2-SLOW-NEXT:    # kill: def $edx killed $edx def $rdx
+; BMI2-SLOW-NEXT:    shrxq %rdx, %rdi, %rcx
+; BMI2-SLOW-NEXT:    notb %dl
+; BMI2-SLOW-NEXT:    addq %rsi, %rsi
+; BMI2-SLOW-NEXT:    shlxq %rdx, %rsi, %rax
+; BMI2-SLOW-NEXT:    orq %rcx, %rax
+; BMI2-SLOW-NEXT:    retq
+;
+; BMI2-FAST-LABEL: rshift:
+; BMI2-FAST:       # %bb.0: # %entry
+; BMI2-FAST-NEXT:    movl %edx, %ecx
+; BMI2-FAST-NEXT:    movq %rdi, %rax
+; BMI2-FAST-NEXT:    # kill: def $cl killed $cl killed $ecx
+; BMI2-FAST-NEXT:    shrdq %cl, %rsi, %rax
+; BMI2-FAST-NEXT:    retq
 entry:
   %sh_prom = zext i32 %c to i64
   %shr = lshr i64 %a, %sh_prom
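The idiom these functions exercise is the classic variable double shift: two complementary shifts joined by an or, which fast-SHLD targets can collapse to a single shrdq/shldq. A self-contained sketch of the right-shift form (hypothetical function, consistent with the context lines above but not necessarily the test's exact source):

```llvm
; Hypothetical sketch of the matched pattern: (%a >> %c) | (%b << (64 - %c)).
; Fast-SHLD targets fold this to shrdq; slow-SHLD targets keep the
; shrx/shlx/not/or-style expansion shown in the BMI2-SLOW checks.
define i64 @rshift_like(i64 %a, i64 %b, i32 %c) nounwind readnone {
entry:
  %sh_prom = zext i32 %c to i64
  %shr = lshr i64 %a, %sh_prom
  %sub = sub nsw i64 64, %sh_prom
  %shl = shl i64 %b, %sub
  %or = or i64 %shl, %shr
  ret i64 %or
}
```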
