Skip to content

Commit c28fa08

Browse files
author
Manish Kausik H
committed
[LegalizeDAG] Optimize CodeGen for ISD::CTLZ_ZERO_UNDEF
Previously, the same instructions were generated for `ISD::CTLZ` and `ISD::CTLZ_ZERO_UNDEF`, which did not take advantage of the fact that zero is an invalid input for `ISD::CTLZ_ZERO_UNDEF`. This commit separates codegen for the two cases to enable the optimization for the latter case. The details of the optimization are outlined in #82075. Fixes #82075.
1 parent 7d913c5 commit c28fa08

File tree

16 files changed

+290
-241
lines changed

16 files changed

+290
-241
lines changed

llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2458,13 +2458,20 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
24582458
NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
24592459
}
24602460

2461+
unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2462+
2463+
if (MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
2464+
// An optimization where the result is the CTLZ after the left shift by
2465+
// (Difference in widety and current ty).
2466+
MIBSrc = MIRBuilder.buildShl(WideTy, MIBSrc,
2467+
MIRBuilder.buildConstant(WideTy, SizeDiff));
2468+
}
2469+
24612470
// Perform the operation at the larger size.
24622471
auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
24632472
// This is already the correct result for CTPOP and CTTZs
2464-
if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
2465-
MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
2473+
if (MI.getOpcode() == TargetOpcode::G_CTLZ) {
24662474
// The correct result is NewOp - (Difference in widety and current ty).
2467-
unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
24682475
MIBNewOp = MIRBuilder.buildSub(
24692476
WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
24702477
}

llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5026,7 +5026,6 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
50265026
case ISD::CTTZ:
50275027
case ISD::CTTZ_ZERO_UNDEF:
50285028
case ISD::CTLZ:
5029-
case ISD::CTLZ_ZERO_UNDEF:
50305029
case ISD::CTPOP:
50315030
// Zero extend the argument unless it's cttz, then use any_extend.
50325031
if (Node->getOpcode() == ISD::CTTZ ||
@@ -5047,15 +5046,30 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
50475046
// Perform the larger operation. For CTPOP and CTTZ_ZERO_UNDEF, this is
50485047
// already the correct result.
50495048
Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
5050-
if (Node->getOpcode() == ISD::CTLZ ||
5051-
Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
5049+
if (Node->getOpcode() == ISD::CTLZ) {
50525050
// Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT))
50535051
Tmp1 = DAG.getNode(ISD::SUB, dl, NVT, Tmp1,
50545052
DAG.getConstant(NVT.getSizeInBits() -
50555053
OVT.getSizeInBits(), dl, NVT));
50565054
}
50575055
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
50585056
break;
5057+
case ISD::CTLZ_ZERO_UNDEF:
5058+
// Zero is an invalid input for this opcode, hence we can take a
5059+
// different approach as compared to ISD::CTLZ
5060+
5061+
// Any Extend the argument
5062+
Tmp1 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(0));
5063+
5064+
// Tmp1 = Tmp1 << (sizeinbits(NVT) - sizeinbits(Old VT))
5065+
Tmp2 = DAG.getShiftAmountConstant(NVT.getSizeInBits() - OVT.getSizeInBits(),
5066+
Tmp1.getValueType(), dl);
5067+
Tmp1 = DAG.getNode(ISD::SHL, dl, NVT, Tmp1, Tmp2);
5068+
5069+
// Perform the larger operation
5070+
Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
5071+
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
5072+
break;
50595073
case ISD::BITREVERSE:
50605074
case ISD::BSWAP: {
50615075
unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits();

llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -646,21 +646,44 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
646646
}
647647
}
648648

649-
// Zero extend to the promoted type and do the count there.
650-
SDValue Op = ZExtPromotedInteger(N->getOperand(0));
649+
unsigned CtlzOpcode = N->getOpcode();
650+
if (CtlzOpcode == ISD::CTLZ || CtlzOpcode == ISD::VP_CTLZ) {
651+
// Zero extend to the promoted type and do the count there.
652+
SDValue Op = ZExtPromotedInteger(N->getOperand(0));
653+
654+
// Subtract off the extra leading bits in the bigger type.
655+
SDValue ExtractLeadingBits = DAG.getConstant(
656+
NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(), dl, NVT);
657+
if (!N->isVPOpcode())
658+
return DAG.getNode(ISD::SUB, dl, NVT,
659+
DAG.getNode(N->getOpcode(), dl, NVT, Op),
660+
ExtractLeadingBits);
661+
SDValue Mask = N->getOperand(1);
662+
SDValue EVL = N->getOperand(2);
663+
return DAG.getNode(ISD::VP_SUB, dl, NVT,
664+
DAG.getNode(N->getOpcode(), dl, NVT, Op, Mask, EVL),
665+
ExtractLeadingBits, Mask, EVL);
666+
}
667+
if (CtlzOpcode == ISD::CTLZ_ZERO_UNDEF ||
668+
CtlzOpcode == ISD::VP_CTLZ_ZERO_UNDEF) {
669+
// Any Extend the argument
670+
SDValue Op = GetPromotedInteger(N->getOperand(0));
671+
672+
// Op = Op << (sizeinbits(NVT) - sizeinbits(Old VT))
673+
unsigned SHLAmount = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
674+
auto ShiftConst =
675+
DAG.getShiftAmountConstant(SHLAmount, Op.getValueType(), dl);
676+
if (!N->isVPOpcode()) {
677+
Op = DAG.getNode(ISD::SHL, dl, NVT, Op, ShiftConst);
678+
return DAG.getNode(CtlzOpcode, dl, NVT, Op);
679+
}
651680

652-
// Subtract off the extra leading bits in the bigger type.
653-
SDValue ExtractLeadingBits = DAG.getConstant(
654-
NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(), dl, NVT);
655-
if (!N->isVPOpcode())
656-
return DAG.getNode(ISD::SUB, dl, NVT,
657-
DAG.getNode(N->getOpcode(), dl, NVT, Op),
658-
ExtractLeadingBits);
659-
SDValue Mask = N->getOperand(1);
660-
SDValue EVL = N->getOperand(2);
661-
return DAG.getNode(ISD::VP_SUB, dl, NVT,
662-
DAG.getNode(N->getOpcode(), dl, NVT, Op, Mask, EVL),
663-
ExtractLeadingBits, Mask, EVL);
681+
SDValue Mask = N->getOperand(1);
682+
SDValue EVL = N->getOperand(2);
683+
Op = DAG.getNode(ISD::VP_SHL, dl, NVT, Op, ShiftConst, Mask, EVL);
684+
return DAG.getNode(CtlzOpcode, dl, NVT, Op, Mask, EVL);
685+
}
686+
llvm_unreachable("Invalid CTLZ Opcode");
664687
}
665688

666689
SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP_PARITY(SDNode *N) {
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc < %s --mtriple=aarch64 | FileCheck %s
3+
4+
declare i8 @llvm.ctlz.i8(i8, i1 immarg)
5+
declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1 immarg)
6+
declare i11 @llvm.ctlz.i11(i11, i1 immarg)
7+
8+
define i32 @clz_nzu8(i8 %self) {
9+
; CHECK-LABEL: clz_nzu8:
10+
; CHECK: // %bb.0: // %start
11+
; CHECK-NEXT: lsl w8, w0, #24
12+
; CHECK-NEXT: clz w0, w8
13+
; CHECK-NEXT: ret
14+
start:
15+
%ctlz_res = call i8 @llvm.ctlz.i8(i8 %self, i1 true)
16+
%ret = zext i8 %ctlz_res to i32
17+
ret i32 %ret
18+
}
19+
20+
; non standard bit size argument to ctlz
21+
define i32 @clz_nzu11(i11 %self) {
22+
; CHECK-LABEL: clz_nzu11:
23+
; CHECK: // %bb.0:
24+
; CHECK-NEXT: lsl w8, w0, #21
25+
; CHECK-NEXT: clz w0, w8
26+
; CHECK-NEXT: ret
27+
%ctlz_res = call i11 @llvm.ctlz.i11(i11 %self, i1 true)
28+
%ret = zext i11 %ctlz_res to i32
29+
ret i32 %ret
30+
}
31+
32+
; vector type argument to ctlz intrinsic
33+
define <8 x i32> @clz_vec_nzu8(<8 x i8> %self) {
34+
; CHECK-LABEL: clz_vec_nzu8:
35+
; CHECK: // %bb.0:
36+
; CHECK-NEXT: clz v0.8b, v0.8b
37+
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
38+
; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
39+
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
40+
; CHECK-NEXT: ret
41+
%ctlz_res = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %self, i1 true)
42+
%ret = zext <8 x i8> %ctlz_res to <8 x i32>
43+
ret <8 x i32> %ret
44+
}

llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir

Lines changed: 20 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,11 @@ body: |
8383
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
8484
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
8585
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
86-
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
8786
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
88-
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]]
89-
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
90-
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
91-
; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32)
87+
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s32)
88+
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[SHL]](s32)
89+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[CTLZ_ZERO_UNDEF]](s32)
90+
; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32)
9291
%0:_(s32) = COPY $vgpr0
9392
%1:_(s16) = G_TRUNC %0
9493
%2:_(s16) = G_CTLZ_ZERO_UNDEF %1
@@ -151,16 +150,14 @@ body: |
151150
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
152151
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
153152
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
154-
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
155-
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C]]
156-
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
157-
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[LSHR]](s32)
158-
; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF1]], [[C]]
159-
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32)
160-
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]]
161-
; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
162-
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
163-
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]]
153+
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C]](s32)
154+
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[SHL]](s32)
155+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[CTLZ_ZERO_UNDEF]](s32)
156+
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[C]](s32)
157+
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[SHL1]](s32)
158+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[CTLZ_ZERO_UNDEF1]](s32)
159+
; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32)
160+
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY1]], [[SHL2]]
164161
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
165162
; CHECK-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
166163
%0:_(<2 x s16>) = COPY $vgpr0
@@ -181,12 +178,11 @@ body: |
181178
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
182179
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
183180
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
184-
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
185181
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
186-
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]]
187-
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
188-
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
189-
; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32)
182+
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s32)
183+
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[SHL]](s32)
184+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[CTLZ_ZERO_UNDEF]](s32)
185+
; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32)
190186
%0:_(s32) = COPY $vgpr0
191187
%1:_(s7) = G_TRUNC %0
192188
%2:_(s7) = G_CTLZ_ZERO_UNDEF %1
@@ -207,11 +203,10 @@ body: |
207203
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
208204
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934591
209205
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
210-
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s64)
211-
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 31
212-
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C1]](s64)
213-
; CHECK-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[CTLZ_ZERO_UNDEF]], [[UV]]
214-
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[USUBO]](s32)
206+
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
207+
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[AND]], [[C1]](s32)
208+
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[SHL]](s64)
209+
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[CTLZ_ZERO_UNDEF]](s32)
215210
; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT]](s64)
216211
%0:_(s64) = COPY $vgpr0_vgpr1
217212
%1:_(s33) = G_TRUNC %0

llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -372,8 +372,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
372372
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
373373
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
374374
; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0xff
375+
; GFX9-GISEL-NEXT: s_lshl_b32 s0, s0, 24
375376
; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0
376-
; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 24
377377
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
378378
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3]
379379
; GFX9-GISEL-NEXT: s_endpgm
@@ -404,9 +404,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
404404
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
405405
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
406406
; VI-NEXT: s_waitcnt lgkmcnt(0)
407-
; VI-NEXT: s_and_b32 s2, s2, 0xffff
407+
; VI-NEXT: s_lshl_b32 s2, s2, 16
408408
; VI-NEXT: s_flbit_i32_b32 s2, s2
409-
; VI-NEXT: s_add_i32 s2, s2, -16
410409
; VI-NEXT: v_mov_b32_e32 v0, s0
411410
; VI-NEXT: v_mov_b32_e32 v1, s1
412411
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -449,8 +448,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
449448
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
450449
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
451450
; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0xffff
451+
; GFX9-GISEL-NEXT: s_lshl_b32 s0, s0, 16
452452
; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0
453-
; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 16
454453
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
455454
; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3]
456455
; GFX9-GISEL-NEXT: s_endpgm
@@ -651,9 +650,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
651650
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
652651
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
653652
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
654-
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
655-
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 24, v2
656-
; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
653+
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1
654+
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
657655
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
658656
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
659657
; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1]
@@ -708,8 +706,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
708706
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v2
709707
; VI-NEXT: s_waitcnt vmcnt(0)
710708
; VI-NEXT: v_or_b32_e32 v0, v1, v0
711-
; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
712-
; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1
709+
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
710+
; VI-NEXT: v_ffbh_u32_e32 v1, v1
713711
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
714712
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
715713
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -756,9 +754,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
756754
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
757755
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
758756
; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
759-
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
760-
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 16, v2
761-
; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
757+
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1
758+
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
762759
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
763760
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
764761
; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
@@ -1164,8 +1161,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
11641161
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
11651162
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
11661163
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1164+
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0
11671165
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
1168-
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 24, v0
11691166
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
11701167
; GFX9-GISEL-NEXT: s_endpgm
11711168
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1702,8 +1699,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
17021699
; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
17031700
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
17041701
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1705-
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1706-
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 24, v1
1702+
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 24, v0
1703+
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
17071704
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
17081705
; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
17091706
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0

llvm/test/CodeGen/Hexagon/bitmanip.ll

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,10 @@ define i16 @ctlz_i16(i16 %a0) #0 {
5050
; CHECK: .cfi_startproc
5151
; CHECK-NEXT: // %bb.0:
5252
; CHECK-NEXT: {
53-
; CHECK-NEXT: r0 = zxth(r0)
53+
; CHECK-NEXT: r0 = aslh(r0)
5454
; CHECK-NEXT: }
5555
; CHECK-NEXT: {
5656
; CHECK-NEXT: r0 = cl0(r0)
57-
; CHECK-NEXT: }
58-
; CHECK-NEXT: {
59-
; CHECK-NEXT: r0 = add(r0,#-16)
6057
; CHECK-NEXT: jumpr r31
6158
; CHECK-NEXT: }
6259
%v0 = tail call i16 @llvm.ctlz.i16(i16 %a0, i1 true) #1

0 commit comments

Comments
 (0)