Skip to content

Commit 27ce7f0

Browse files
author
Leon Clark
committed
Add optimisations to GISel and address comments.
1 parent a9c548c commit 27ce7f0

File tree

5 files changed

+77
-36
lines changed

5 files changed

+77
-36
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3093,12 +3093,14 @@ SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
30933093

30943094
auto const NumBits = ResultVT.getFixedSizeInBits();
30953095
auto NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3096-
auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3096+
auto NewOp = SDValue();
30973097

30983098
if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3099+
NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
30993100
NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
31003101
NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
31013102
} else {
3103+
NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
31023104
NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
31033105
NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
31043106
}

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1270,7 +1270,30 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
12701270
.custom();
12711271

12721272
// The 64-bit versions produce 32-bit results, but only on the SALU.
1273-
getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1273+
// getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1274+
// .legalFor({{S32, S32}, {S32, S64}})
1275+
// .clampScalar(0, S32, S32)
1276+
// .clampScalar(1, S32, S64)
1277+
// .scalarize(0)
1278+
// .widenScalarToNextPow2(0, 32)
1279+
// .widenScalarToNextPow2(1, 32);
1280+
1281+
getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1282+
.legalFor({{S32, S32}, {S32, S64}})
1283+
.customFor({{S32, S8}, {S32, S16}})
1284+
.clampScalar(0, S32, S32)
1285+
.clampScalar(1, S32, S64)
1286+
.scalarize(0)
1287+
.widenScalarToNextPow2(0, 32)
1288+
.widenScalarToNextPow2(1, 32);
1289+
// .custom();
1290+
1291+
// .legalFor({S32})
1292+
// .customFor({S64})
1293+
// .clampScalar(0, S32, S64)
1294+
// .scalarize(0);
1295+
1296+
getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
12741297
.legalFor({{S32, S32}, {S32, S64}})
12751298
.clampScalar(0, S32, S32)
12761299
.clampScalar(1, S32, S64)
@@ -2128,6 +2151,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
21282151
case TargetOpcode::G_CTLZ:
21292152
case TargetOpcode::G_CTTZ:
21302153
return legalizeCTLZ_CTTZ(MI, MRI, B);
2154+
case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2155+
return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
21312156
case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
21322157
return legalizeFPTruncRound(MI, B);
21332158
case TargetOpcode::G_STACKSAVE:
@@ -4145,6 +4170,29 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
41454170
return true;
41464171
}
41474172

4173+
/// Custom legalization for G_CTLZ_ZERO_UNDEF with a source narrower than 32
/// bits.
///
/// The source is any-extended to 32 bits and then shifted *left* by
/// (32 - NumBits) so that its most significant bit lands in bit 31. The
/// leading-zero count of the shifted value is then the leading-zero count of
/// the original narrow value, so no subtraction is needed afterwards. The
/// shift also pushes the unspecified high bits produced by the any-extend out
/// of the register, so they cannot influence the result. This mirrors the
/// ANY_EXTEND + SHL sequence used for CTLZ_ZERO_UNDEF in
/// AMDGPUTargetLowering::lowerCTLZResults.
///
/// \param MI  The G_CTLZ_ZERO_UNDEF instruction; erased on return.
/// \param MRI Used to query the type of the source register.
/// \param B   Builder used to emit the replacement instructions.
/// \returns true (the instruction is always rewritten).
bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
                                                  MachineRegisterInfo &MRI,
                                                  MachineIRBuilder &B) const {
  const auto Dst = MI.getOperand(0).getReg();
  const auto Src = MI.getOperand(1).getReg();
  const auto NumBits = MRI.getType(Src).getSizeInBits();

  assert(NumBits < 32u && "expected a source narrower than 32 bits");

  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
  // Must be a left shift: a right shift would drop the real source bits and
  // leave only the unspecified any-extended bits to be counted.
  auto Shift = B.buildShl(S32, {Extend}, ShiftAmt).getReg(0u);
  B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {Dst}, {Shift});
  MI.eraseFromParent();
  return true;
}
4195+
41484196
// Check that this is a G_XOR x, -1
41494197
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
41504198
if (MI.getOpcode() != TargetOpcode::G_XOR)

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
108108
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const;
109109
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI,
110110
MachineIRBuilder &B) const;
111+
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI,
112+
MachineIRBuilder &B) const;
111113

112114
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
113115
const ArgDescriptor *Arg,

llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -81,14 +81,12 @@ body: |
8181
; CHECK: liveins: $vgpr0
8282
; CHECK-NEXT: {{ $}}
8383
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
84-
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
85-
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
86-
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
87-
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
88-
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]]
89-
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
90-
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
91-
; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32)
84+
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
85+
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
86+
; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32)
87+
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
88+
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
89+
; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
9290
%0:_(s32) = COPY $vgpr0
9391
%1:_(s16) = G_TRUNC %0
9492
%2:_(s16) = G_CTLZ_ZERO_UNDEF %1
@@ -149,18 +147,15 @@ body: |
149147
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
150148
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
151149
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
150+
; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
151+
; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR1]](s32)
152+
; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[C]](s32)
153+
; CHECK-NEXT: [[AMDGPU_FFBH_U321:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR2]](s32)
152154
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
153-
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
154-
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
155-
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C]]
156-
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
157-
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[LSHR]](s32)
158-
; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF1]], [[C]]
159-
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32)
160-
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]]
161-
; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
162-
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
163-
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]]
155+
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
156+
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U321]], [[C1]]
157+
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
158+
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
164159
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
165160
; CHECK-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
166161
%0:_(<2 x s16>) = COPY $vgpr0

llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -369,9 +369,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
369369
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
370370
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
371371
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
372-
; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0xff
372+
; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 24
373373
; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0
374-
; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 24
375374
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
376375
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3]
377376
; GFX9-GISEL-NEXT: s_endpgm
@@ -445,9 +444,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
445444
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
446445
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
447446
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
448-
; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0xffff
447+
; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 16
449448
; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0
450-
; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 16
451449
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
452450
; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3]
453451
; GFX9-GISEL-NEXT: s_endpgm
@@ -649,8 +647,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
649647
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
650648
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
651649
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
652-
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
653-
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 24, v2
650+
; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1
654651
; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
655652
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
656653
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
@@ -755,8 +752,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
755752
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
756753
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
757754
; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
758-
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
759-
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 16, v2
755+
; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1
760756
; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
761757
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
762758
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
@@ -1163,8 +1159,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
11631159
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
11641160
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
11651161
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1166-
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
1167-
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 24, v0
1162+
; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0
11681163
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
11691164
; GFX9-GISEL-NEXT: s_endpgm
11701165
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1700,12 +1695,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
17001695
; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0
17011696
; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
17021697
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
1703-
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1704-
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
1705-
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 24, v1
1706-
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1707-
; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
17081698
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
1699+
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
1700+
; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v0
1701+
; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[2:3], v0, v1
1702+
; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, s[2:3]
17091703
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
17101704
; GFX9-GISEL-NEXT: s_endpgm
17111705
%tid = call i32 @llvm.amdgcn.workitem.id.x()

0 commit comments

Comments
 (0)