Skip to content

Commit ca2b902

Browse files
author
Leon Clark
committed
Remove unnecessary add instructions in ctlz.i8
Add custom lowering for ctlz.i8 to avoid multiple add/sub operations.
1 parent d65a7d1 commit ca2b902

File tree

4 files changed

+35
-13
lines changed

4 files changed

+35
-13
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
446446
{ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
447447
MVT::i64, Custom);
448448

449+
setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i8, Custom);
450+
449451
static const MVT::SimpleValueType VectorIntTypes[] = {
450452
MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
451453
MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
@@ -1397,6 +1399,10 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
13971399
if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
13981400
Results.push_back(Lowered);
13991401
return;
1402+
case ISD::CTLZ:
1403+
case ISD::CTLZ_ZERO_UNDEF:
1404+
replaceCTLZResults(SDValue(N, 0u), DAG, Results);
1405+
return;
14001406
default:
14011407
return;
14021408
}
@@ -3062,6 +3068,25 @@ static bool isCttzOpc(unsigned Opc) {
30623068
return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
30633069
}
30643070

3071+
/// Custom result replacement for 8-bit count-leading-zeros nodes.
///
/// Invoked from ReplaceNodeResults for ISD::CTLZ and ISD::CTLZ_ZERO_UNDEF.
/// When the result type is i8, the operand is zero-extended to i32, the same
/// opcode is applied at i32, the 24 leading zeros contributed by the extension
/// are subtracted with a single i32 SUB, and the value is truncated back to i8.
///
/// \param Op      The CTLZ / CTLZ_ZERO_UNDEF value being replaced.
/// \param DAG     The DAG in which the replacement nodes are created.
/// \param Results Receives the single replacement value; left untouched when
///                the result type is not i8.
void AMDGPUTargetLowering::replaceCTLZResults(
    SDValue Op, SelectionDAG &DAG, SmallVectorImpl<SDValue> &Results) const {
  SDLoc SL(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ResultVT = Op.getValueType();

  // Only the i8 form is handled here; any other type produces no results.
  if (ResultVT != MVT::i8)
    return;

  assert(isCtlzOpc(Op.getOpcode()));
  assert(ResultVT == Arg.getValueType());

  // Zero-extending i8 to i32 prepends this many zero bits, all of which the
  // 32-bit count includes and which must be removed again afterwards.
  constexpr unsigned NumExtBits = 32u - 8u;

  SDValue SubVal = DAG.getConstant(NumExtBits, SL, MVT::i32);
  SDValue NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
  NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
  NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
  Results.push_back(DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp));
}
3089+
30653090
SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
30663091
SDLoc SL(Op);
30673092
SDValue Src = Op.getOperand(0);

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,9 @@ class AMDGPUTargetLowering : public TargetLowering {
8484
SDNodeFlags Flags) const;
8585
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const;
8686

87+
/// Result replacement for ISD::CTLZ / ISD::CTLZ_ZERO_UNDEF nodes, called from
/// ReplaceNodeResults. For an i8 result it appends one replacement value to
/// \p Results (the count computed at i32 and truncated back to i8); for any
/// other result type it appends nothing.
void replaceCTLZResults(SDValue Op, SelectionDAG &DAG,
88+
SmallVectorImpl<SDValue> &Results) const;
89+
8790
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const;
8891

8992
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const;

llvm/test/CodeGen/AMDGPU/ctlz.ll

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -514,8 +514,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
514514
; VI-NEXT: s_waitcnt vmcnt(0)
515515
; VI-NEXT: v_ffbh_u32_e32 v0, v0
516516
; VI-NEXT: v_min_u32_e32 v0, 32, v0
517-
; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
518-
; VI-NEXT: v_add_u16_e32 v0, -8, v0
517+
; VI-NEXT: v_subrev_u32_e32 v0, vcc, 24, v0
519518
; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
520519
; VI-NEXT: s_endpgm
521520
;
@@ -558,8 +557,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
558557
; GFX10-NEXT: s_waitcnt vmcnt(0)
559558
; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
560559
; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
561-
; GFX10-NEXT: v_add_nc_u32_e32 v1, -16, v1
562-
; GFX10-NEXT: v_add_nc_u16 v1, v1, -8
560+
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v1
563561
; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
564562
; GFX10-NEXT: s_endpgm
565563
;
@@ -586,9 +584,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
586584
; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
587585
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
588586
; GFX11-NEXT: v_min_u32_e32 v1, 32, v1
589-
; GFX11-NEXT: v_add_nc_u32_e32 v1, -16, v1
590-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
591-
; GFX11-NEXT: v_add_nc_u16 v1, v1, -8
587+
; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v1
592588
; GFX11-NEXT: global_store_b8 v0, v1, s[0:1]
593589
; GFX11-NEXT: s_nop 0
594590
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)

llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -329,10 +329,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
329329
; VI-NEXT: s_waitcnt lgkmcnt(0)
330330
; VI-NEXT: s_and_b32 s2, s2, 0xff
331331
; VI-NEXT: s_flbit_i32_b32 s2, s2
332-
; VI-NEXT: s_add_i32 s2, s2, -16
332+
; VI-NEXT: s_sub_i32 s2, s2, 24
333333
; VI-NEXT: v_mov_b32_e32 v0, s0
334-
; VI-NEXT: v_add_u16_e64 v2, s2, -8
335334
; VI-NEXT: v_mov_b32_e32 v1, s1
335+
; VI-NEXT: v_mov_b32_e32 v2, s2
336336
; VI-NEXT: flat_store_byte v[0:1], v2
337337
; VI-NEXT: s_endpgm
338338
;
@@ -606,8 +606,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
606606
; VI-NEXT: flat_load_ubyte v0, v[0:1]
607607
; VI-NEXT: s_waitcnt vmcnt(0)
608608
; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
609-
; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1
610-
; VI-NEXT: v_add_u16_e32 v1, -8, v1
609+
; VI-NEXT: v_subrev_u32_e32 v1, vcc, 24, v1
611610
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
612611
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
613612
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1118,8 +1117,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
11181117
; VI-NEXT: flat_load_ubyte v0, v[0:1]
11191118
; VI-NEXT: s_waitcnt vmcnt(0)
11201119
; VI-NEXT: v_ffbh_u32_e32 v0, v0
1121-
; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
1122-
; VI-NEXT: v_add_u16_e32 v2, -8, v0
1120+
; VI-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0
11231121
; VI-NEXT: v_mov_b32_e32 v0, s0
11241122
; VI-NEXT: v_mov_b32_e32 v1, s1
11251123
; VI-NEXT: flat_store_byte v[0:1], v2

0 commit comments

Comments
 (0)