Skip to content

Commit becd418

Browse files
authored
[CGP] Despeculate ctlz/cttz with "illegal" integer types (#137197)
The code below the removed check looks generic enough to support arbitrary integer widths. This change helps 32-bit targets avoid expensive expansion/libcalls in the case of zero input.

Pull Request: #137197
1 parent 705ceff commit becd418

File tree

12 files changed: +387 -520 lines changed

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2552,9 +2552,9 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
25522552
(IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz(Ty)))
25532553
return false;
25542554

2555-
// Only handle legal scalar cases. Anything else requires too much work.
2555+
// Only handle scalar cases. Anything else requires too much work.
25562556
unsigned SizeInBits = Ty->getScalarSizeInBits();
2557-
if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
2557+
if (Ty->isVectorTy())
25582558
return false;
25592559

25602560
// Bail if the value is never zero.

llvm/test/CodeGen/ARM/cttz.ll

Lines changed: 76 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -221,87 +221,99 @@ define i64 @test_i64(i64 %a) {
221221
;
222222
; CHECK-6M-LABEL: test_i64:
223223
; CHECK-6M: @ %bb.0:
224-
; CHECK-6M-NEXT: .save {r4, r5, r7, lr}
225-
; CHECK-6M-NEXT: push {r4, r5, r7, lr}
224+
; CHECK-6M-NEXT: .save {r4, r5, r6, lr}
225+
; CHECK-6M-NEXT: push {r4, r5, r6, lr}
226+
; CHECK-6M-NEXT: mov r3, r1
226227
; CHECK-6M-NEXT: mov r2, r0
227-
; CHECK-6M-NEXT: ldr r5, .LCPI3_0
228-
; CHECK-6M-NEXT: adr r3, .LCPI3_1
228+
; CHECK-6M-NEXT: movs r1, #0
229+
; CHECK-6M-NEXT: orrs r0, r3
230+
; CHECK-6M-NEXT: beq .LBB3_6
231+
; CHECK-6M-NEXT: @ %bb.1: @ %cond.false
232+
; CHECK-6M-NEXT: ldr r6, .LCPI3_0
233+
; CHECK-6M-NEXT: adr r4, .LCPI3_1
229234
; CHECK-6M-NEXT: movs r0, #32
230-
; CHECK-6M-NEXT: cmp r1, #0
231-
; CHECK-6M-NEXT: mov r4, r0
232-
; CHECK-6M-NEXT: beq .LBB3_2
233-
; CHECK-6M-NEXT: @ %bb.1:
234-
; CHECK-6M-NEXT: rsbs r4, r1, #0
235-
; CHECK-6M-NEXT: ands r4, r1
236-
; CHECK-6M-NEXT: muls r4, r5, r4
237-
; CHECK-6M-NEXT: lsrs r1, r4, #27
238-
; CHECK-6M-NEXT: ldrb r4, [r3, r1]
239-
; CHECK-6M-NEXT: .LBB3_2:
240-
; CHECK-6M-NEXT: adds r4, #32
241-
; CHECK-6M-NEXT: rsbs r1, r2, #0
242-
; CHECK-6M-NEXT: ands r1, r2
243-
; CHECK-6M-NEXT: muls r5, r1, r5
244-
; CHECK-6M-NEXT: lsrs r1, r5, #27
235+
; CHECK-6M-NEXT: cmp r3, #0
236+
; CHECK-6M-NEXT: mov r5, r0
237+
; CHECK-6M-NEXT: beq .LBB3_3
238+
; CHECK-6M-NEXT: @ %bb.2: @ %cond.false
239+
; CHECK-6M-NEXT: rsbs r5, r3, #0
240+
; CHECK-6M-NEXT: ands r5, r3
241+
; CHECK-6M-NEXT: muls r5, r6, r5
242+
; CHECK-6M-NEXT: lsrs r3, r5, #27
243+
; CHECK-6M-NEXT: ldrb r5, [r4, r3]
244+
; CHECK-6M-NEXT: .LBB3_3: @ %cond.false
245+
; CHECK-6M-NEXT: adds r5, #32
246+
; CHECK-6M-NEXT: rsbs r3, r2, #0
247+
; CHECK-6M-NEXT: ands r3, r2
248+
; CHECK-6M-NEXT: muls r6, r3, r6
249+
; CHECK-6M-NEXT: lsrs r3, r6, #27
245250
; CHECK-6M-NEXT: cmp r2, #0
246-
; CHECK-6M-NEXT: bne .LBB3_5
247-
; CHECK-6M-NEXT: @ %bb.3:
248-
; CHECK-6M-NEXT: beq .LBB3_6
249-
; CHECK-6M-NEXT: .LBB3_4:
250-
; CHECK-6M-NEXT: movs r1, #0
251-
; CHECK-6M-NEXT: pop {r4, r5, r7, pc}
252-
; CHECK-6M-NEXT: .LBB3_5:
253-
; CHECK-6M-NEXT: ldrb r0, [r3, r1]
254-
; CHECK-6M-NEXT: bne .LBB3_4
251+
; CHECK-6M-NEXT: bne .LBB3_7
252+
; CHECK-6M-NEXT: @ %bb.4: @ %cond.false
253+
; CHECK-6M-NEXT: beq .LBB3_8
254+
; CHECK-6M-NEXT: .LBB3_5: @ %cond.end
255+
; CHECK-6M-NEXT: pop {r4, r5, r6, pc}
255256
; CHECK-6M-NEXT: .LBB3_6:
256-
; CHECK-6M-NEXT: mov r0, r4
257-
; CHECK-6M-NEXT: movs r1, #0
258-
; CHECK-6M-NEXT: pop {r4, r5, r7, pc}
257+
; CHECK-6M-NEXT: movs r0, #64
258+
; CHECK-6M-NEXT: pop {r4, r5, r6, pc}
259+
; CHECK-6M-NEXT: .LBB3_7: @ %cond.false
260+
; CHECK-6M-NEXT: ldrb r0, [r4, r3]
261+
; CHECK-6M-NEXT: bne .LBB3_5
262+
; CHECK-6M-NEXT: .LBB3_8: @ %cond.false
263+
; CHECK-6M-NEXT: mov r0, r5
264+
; CHECK-6M-NEXT: pop {r4, r5, r6, pc}
259265
; CHECK-6M-NEXT: .p2align 2
260-
; CHECK-6M-NEXT: @ %bb.7:
266+
; CHECK-6M-NEXT: @ %bb.9:
261267
; CHECK-6M-NEXT: .LCPI3_0:
262268
; CHECK-6M-NEXT: .long 125613361 @ 0x77cb531
263269
; CHECK-6M-NEXT: .LCPI3_1:
264270
; CHECK-6M-NEXT: .ascii "\000\001\034\002\035\016\030\003\036\026\024\017\031\021\004\b\037\033\r\027\025\023\020\007\032\f\022\006\013\005\n\t"
265271
;
266272
; CHECK-8MBASE-LABEL: test_i64:
267273
; CHECK-8MBASE: @ %bb.0:
268-
; CHECK-8MBASE-NEXT: .save {r4, r5, r7, lr}
269-
; CHECK-8MBASE-NEXT: push {r4, r5, r7, lr}
274+
; CHECK-8MBASE-NEXT: .save {r4, r5, r6, lr}
275+
; CHECK-8MBASE-NEXT: push {r4, r5, r6, lr}
276+
; CHECK-8MBASE-NEXT: mov r3, r1
270277
; CHECK-8MBASE-NEXT: mov r2, r0
271-
; CHECK-8MBASE-NEXT: movw r5, #46385
272-
; CHECK-8MBASE-NEXT: movt r5, #1916
273-
; CHECK-8MBASE-NEXT: adr r3, .LCPI3_0
278+
; CHECK-8MBASE-NEXT: movs r1, #0
279+
; CHECK-8MBASE-NEXT: orrs r0, r3
280+
; CHECK-8MBASE-NEXT: beq .LBB3_6
281+
; CHECK-8MBASE-NEXT: @ %bb.1: @ %cond.false
282+
; CHECK-8MBASE-NEXT: movw r6, #46385
283+
; CHECK-8MBASE-NEXT: movt r6, #1916
284+
; CHECK-8MBASE-NEXT: adr r4, .LCPI3_0
274285
; CHECK-8MBASE-NEXT: movs r0, #32
275-
; CHECK-8MBASE-NEXT: mov r4, r0
276-
; CHECK-8MBASE-NEXT: cbz r1, .LBB3_2
277-
; CHECK-8MBASE-NEXT: @ %bb.1:
278-
; CHECK-8MBASE-NEXT: rsbs r4, r1, #0
279-
; CHECK-8MBASE-NEXT: ands r4, r1
280-
; CHECK-8MBASE-NEXT: muls r4, r5, r4
281-
; CHECK-8MBASE-NEXT: lsrs r1, r4, #27
282-
; CHECK-8MBASE-NEXT: ldrb r4, [r3, r1]
283-
; CHECK-8MBASE-NEXT: .LBB3_2:
284-
; CHECK-8MBASE-NEXT: adds r4, #32
285-
; CHECK-8MBASE-NEXT: rsbs r1, r2, #0
286-
; CHECK-8MBASE-NEXT: ands r1, r2
287-
; CHECK-8MBASE-NEXT: muls r5, r1, r5
288-
; CHECK-8MBASE-NEXT: lsrs r1, r5, #27
286+
; CHECK-8MBASE-NEXT: mov r5, r0
287+
; CHECK-8MBASE-NEXT: cbz r3, .LBB3_3
288+
; CHECK-8MBASE-NEXT: @ %bb.2: @ %cond.false
289+
; CHECK-8MBASE-NEXT: rsbs r5, r3, #0
290+
; CHECK-8MBASE-NEXT: ands r5, r3
291+
; CHECK-8MBASE-NEXT: muls r5, r6, r5
292+
; CHECK-8MBASE-NEXT: lsrs r3, r5, #27
293+
; CHECK-8MBASE-NEXT: ldrb r5, [r4, r3]
294+
; CHECK-8MBASE-NEXT: .LBB3_3: @ %cond.false
295+
; CHECK-8MBASE-NEXT: adds r5, #32
296+
; CHECK-8MBASE-NEXT: rsbs r3, r2, #0
297+
; CHECK-8MBASE-NEXT: ands r3, r2
298+
; CHECK-8MBASE-NEXT: muls r6, r3, r6
299+
; CHECK-8MBASE-NEXT: lsrs r3, r6, #27
289300
; CHECK-8MBASE-NEXT: cmp r2, #0
290-
; CHECK-8MBASE-NEXT: bne .LBB3_5
291-
; CHECK-8MBASE-NEXT: @ %bb.3:
292-
; CHECK-8MBASE-NEXT: beq .LBB3_6
293-
; CHECK-8MBASE-NEXT: .LBB3_4:
294-
; CHECK-8MBASE-NEXT: movs r1, #0
295-
; CHECK-8MBASE-NEXT: pop {r4, r5, r7, pc}
296-
; CHECK-8MBASE-NEXT: .LBB3_5:
297-
; CHECK-8MBASE-NEXT: ldrb r0, [r3, r1]
298-
; CHECK-8MBASE-NEXT: bne .LBB3_4
301+
; CHECK-8MBASE-NEXT: bne .LBB3_7
302+
; CHECK-8MBASE-NEXT: @ %bb.4: @ %cond.false
303+
; CHECK-8MBASE-NEXT: beq .LBB3_8
304+
; CHECK-8MBASE-NEXT: .LBB3_5: @ %cond.end
305+
; CHECK-8MBASE-NEXT: pop {r4, r5, r6, pc}
299306
; CHECK-8MBASE-NEXT: .LBB3_6:
300-
; CHECK-8MBASE-NEXT: mov r0, r4
301-
; CHECK-8MBASE-NEXT: movs r1, #0
302-
; CHECK-8MBASE-NEXT: pop {r4, r5, r7, pc}
307+
; CHECK-8MBASE-NEXT: movs r0, #64
308+
; CHECK-8MBASE-NEXT: pop {r4, r5, r6, pc}
309+
; CHECK-8MBASE-NEXT: .LBB3_7: @ %cond.false
310+
; CHECK-8MBASE-NEXT: ldrb r0, [r4, r3]
311+
; CHECK-8MBASE-NEXT: bne .LBB3_5
312+
; CHECK-8MBASE-NEXT: .LBB3_8: @ %cond.false
313+
; CHECK-8MBASE-NEXT: mov r0, r5
314+
; CHECK-8MBASE-NEXT: pop {r4, r5, r6, pc}
303315
; CHECK-8MBASE-NEXT: .p2align 2
304-
; CHECK-8MBASE-NEXT: @ %bb.7:
316+
; CHECK-8MBASE-NEXT: @ %bb.9:
305317
; CHECK-8MBASE-NEXT: .LCPI3_0:
306318
; CHECK-8MBASE-NEXT: .ascii "\000\001\034\002\035\016\030\003\036\026\024\017\031\021\004\b\037\033\r\027\025\023\020\007\032\f\022\006\013\005\n\t"
307319
%tmp = call i64 @llvm.cttz.i64(i64 %a, i1 false)

llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll

Lines changed: 20 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -62,15 +62,18 @@ declare i64 @llvm.ctlz.i64(i64, i1)
6262
define i64 @ctlz_i64(i64 %a) nounwind {
6363
; RV32I-LABEL: ctlz_i64:
6464
; RV32I: # %bb.0:
65+
; RV32I-NEXT: or a2, a0, a1
66+
; RV32I-NEXT: beqz a2, .LBB1_3
67+
; RV32I-NEXT: # %bb.1: # %cond.false
6568
; RV32I-NEXT: lui a2, 349525
6669
; RV32I-NEXT: lui a3, 209715
6770
; RV32I-NEXT: lui a6, 61681
6871
; RV32I-NEXT: addi a5, a2, 1365
6972
; RV32I-NEXT: addi a4, a3, 819
7073
; RV32I-NEXT: addi a3, a6, -241
7174
; RV32I-NEXT: li a2, 32
72-
; RV32I-NEXT: beqz a1, .LBB1_2
73-
; RV32I-NEXT: # %bb.1:
75+
; RV32I-NEXT: beqz a1, .LBB1_4
76+
; RV32I-NEXT: # %bb.2: # %cond.false
7477
; RV32I-NEXT: srli a0, a1, 1
7578
; RV32I-NEXT: or a0, a1, a0
7679
; RV32I-NEXT: srli a1, a0, 2
@@ -99,7 +102,11 @@ define i64 @ctlz_i64(i64 %a) nounwind {
99102
; RV32I-NEXT: sub a0, a2, a0
100103
; RV32I-NEXT: li a1, 0
101104
; RV32I-NEXT: ret
102-
; RV32I-NEXT: .LBB1_2:
105+
; RV32I-NEXT: .LBB1_3:
106+
; RV32I-NEXT: li a1, 0
107+
; RV32I-NEXT: li a0, 64
108+
; RV32I-NEXT: ret
109+
; RV32I-NEXT: .LBB1_4:
103110
; RV32I-NEXT: srli a1, a0, 1
104111
; RV32I-NEXT: or a0, a0, a1
105112
; RV32I-NEXT: srli a1, a0, 2
@@ -195,14 +202,17 @@ declare i64 @llvm.cttz.i64(i64, i1)
195202
define i64 @cttz_i64(i64 %a) nounwind {
196203
; RV32I-LABEL: cttz_i64:
197204
; RV32I: # %bb.0:
205+
; RV32I-NEXT: or a2, a0, a1
206+
; RV32I-NEXT: beqz a2, .LBB3_3
207+
; RV32I-NEXT: # %bb.1: # %cond.false
198208
; RV32I-NEXT: lui a2, 349525
199209
; RV32I-NEXT: lui a3, 209715
200210
; RV32I-NEXT: lui a5, 61681
201211
; RV32I-NEXT: addi a4, a2, 1365
202212
; RV32I-NEXT: addi a3, a3, 819
203213
; RV32I-NEXT: addi a2, a5, -241
204-
; RV32I-NEXT: beqz a0, .LBB3_2
205-
; RV32I-NEXT: # %bb.1:
214+
; RV32I-NEXT: beqz a0, .LBB3_4
215+
; RV32I-NEXT: # %bb.2: # %cond.false
206216
; RV32I-NEXT: not a1, a0
207217
; RV32I-NEXT: addi a0, a0, -1
208218
; RV32I-NEXT: and a0, a1, a0
@@ -223,7 +233,11 @@ define i64 @cttz_i64(i64 %a) nounwind {
223233
; RV32I-NEXT: srli a0, a0, 24
224234
; RV32I-NEXT: li a1, 0
225235
; RV32I-NEXT: ret
226-
; RV32I-NEXT: .LBB3_2:
236+
; RV32I-NEXT: .LBB3_3:
237+
; RV32I-NEXT: li a1, 0
238+
; RV32I-NEXT: li a0, 64
239+
; RV32I-NEXT: ret
240+
; RV32I-NEXT: .LBB3_4:
227241
; RV32I-NEXT: not a0, a1
228242
; RV32I-NEXT: addi a1, a1, -1
229243
; RV32I-NEXT: and a0, a0, a1

0 commit comments

Comments
 (0)