Skip to content

Commit bf77b11

Browse files
committed
[AMDGPU] Introduce optimizeCompareInstr
The following patterns are currently handled:

s_cmp_eq_u32 (s_and_b32 $src, 1), 1 => s_and_b32 $src, 1
s_cmp_eq_i32 (s_and_b32 $src, 1), 1 => s_and_b32 $src, 1
s_cmp_eq_u64 (s_and_b64 $src, 1), 1 => s_and_b64 $src, 1
s_cmp_ge_u32 (s_and_b32 $src, 1), 1 => s_and_b32 $src, 1
s_cmp_ge_i32 (s_and_b32 $src, 1), 1 => s_and_b32 $src, 1
s_cmp_lg_u32 (s_and_b32 $src, 1), 0 => s_and_b32 $src, 1
s_cmp_lg_i32 (s_and_b32 $src, 1), 0 => s_and_b32 $src, 1
s_cmp_lg_u64 (s_and_b64 $src, 1), 0 => s_and_b64 $src, 1
s_cmp_gt_u32 (s_and_b32 $src, 1), 0 => s_and_b32 $src, 1
s_cmp_gt_i32 (s_and_b32 $src, 1), 0 => s_and_b32 $src, 1

Differential Revision: https://reviews.llvm.org/D109031
1 parent a10409f commit bf77b11

File tree

9 files changed

+1135
-10
lines changed

9 files changed

+1135
-10
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7940,3 +7940,143 @@ unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
79407940
return 0;
79417941
}
79427942
}
7943+
7944+
bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
7945+
Register &SrcReg2, int64_t &CmpMask,
7946+
int64_t &CmpValue) const {
7947+
if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
7948+
return false;
7949+
7950+
switch (MI.getOpcode()) {
7951+
default:
7952+
break;
7953+
case AMDGPU::S_CMP_EQ_U32:
7954+
case AMDGPU::S_CMP_EQ_I32:
7955+
case AMDGPU::S_CMP_LG_U32:
7956+
case AMDGPU::S_CMP_LG_I32:
7957+
case AMDGPU::S_CMP_LT_U32:
7958+
case AMDGPU::S_CMP_LT_I32:
7959+
case AMDGPU::S_CMP_GT_U32:
7960+
case AMDGPU::S_CMP_GT_I32:
7961+
case AMDGPU::S_CMP_LE_U32:
7962+
case AMDGPU::S_CMP_LE_I32:
7963+
case AMDGPU::S_CMP_GE_U32:
7964+
case AMDGPU::S_CMP_GE_I32:
7965+
case AMDGPU::S_CMP_EQ_U64:
7966+
case AMDGPU::S_CMP_LG_U64:
7967+
SrcReg = MI.getOperand(0).getReg();
7968+
if (MI.getOperand(1).isReg()) {
7969+
if (MI.getOperand(1).getSubReg())
7970+
return false;
7971+
SrcReg2 = MI.getOperand(1).getReg();
7972+
CmpValue = 0;
7973+
} else if (MI.getOperand(1).isImm()) {
7974+
SrcReg2 = Register();
7975+
CmpValue = MI.getOperand(1).getImm();
7976+
} else {
7977+
return false;
7978+
}
7979+
CmpMask = ~0;
7980+
return true;
7981+
case AMDGPU::S_CMPK_EQ_U32:
7982+
case AMDGPU::S_CMPK_EQ_I32:
7983+
case AMDGPU::S_CMPK_LG_U32:
7984+
case AMDGPU::S_CMPK_LG_I32:
7985+
case AMDGPU::S_CMPK_LT_U32:
7986+
case AMDGPU::S_CMPK_LT_I32:
7987+
case AMDGPU::S_CMPK_GT_U32:
7988+
case AMDGPU::S_CMPK_GT_I32:
7989+
case AMDGPU::S_CMPK_LE_U32:
7990+
case AMDGPU::S_CMPK_LE_I32:
7991+
case AMDGPU::S_CMPK_GE_U32:
7992+
case AMDGPU::S_CMPK_GE_I32:
7993+
SrcReg = MI.getOperand(0).getReg();
7994+
SrcReg2 = Register();
7995+
CmpValue = MI.getOperand(1).getImm();
7996+
CmpMask = ~0;
7997+
return true;
7998+
}
7999+
8000+
return false;
8001+
}
8002+
8003+
/// Try to delete a scalar compare whose result is already available in SCC
/// from the S_AND that produced its input.
///
/// Handled patterns (the compare is erased and the AND's SCC def is revived):
///   s_cmp_eq_u32 (s_and_b32 $src, 1), 1 => s_and_b32 $src, 1
///   s_cmp_eq_i32 (s_and_b32 $src, 1), 1 => s_and_b32 $src, 1
///   s_cmp_ge_u32 (s_and_b32 $src, 1), 1 => s_and_b32 $src, 1
///   s_cmp_ge_i32 (s_and_b32 $src, 1), 1 => s_and_b32 $src, 1
///   s_cmp_eq_u64 (s_and_b64 $src, 1), 1 => s_and_b64 $src, 1
///   s_cmp_lg_u32 (s_and_b32 $src, 1), 0 => s_and_b32 $src, 1
///   s_cmp_lg_i32 (s_and_b32 $src, 1), 0 => s_and_b32 $src, 1
///   s_cmp_gt_u32 (s_and_b32 $src, 1), 0 => s_and_b32 $src, 1
///   s_cmp_gt_i32 (s_and_b32 $src, 1), 0 => s_and_b32 $src, 1
///   s_cmp_lg_u64 (s_and_b64 $src, 1), 0 => s_and_b64 $src, 1
/// The S_CMPK_* equivalents of the 32-bit compares are matched as well.
///
/// \param CmpInstr the compare to optimize; erased on success.
/// \param SrcReg   first compared register; physical registers are rejected.
/// \param SrcReg2  second compared register; must be empty (compare against
///                 an immediate).
/// \param CmpMask  unused here.
/// \param CmpValue the immediate the compare tests against.
/// \param MRI      used to look up the unique definition of \p SrcReg.
/// \returns true if \p CmpInstr was removed.
bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
                                       Register SrcReg2, int64_t CmpMask,
                                       int64_t CmpValue,
                                       const MachineRegisterInfo *MRI) const {
  // Only register-vs-immediate compares on non-physical registers qualify.
  if (SrcReg2)
    return false;
  if (SrcReg.isPhysical())
    return false;

  // TODO: Fold this into s_bitcmp* if result of an AND is unused.
  // TODO: If s_bitcmp can be used we are not limited to 1 and 0 but can
  //       process any power of 2.

  // Each supported opcode is only foldable for one particular immediate.
  int64_t ExpectedValue = 0;
  switch (CmpInstr.getOpcode()) {
  case AMDGPU::S_CMP_EQ_U32:
  case AMDGPU::S_CMP_EQ_I32:
  case AMDGPU::S_CMP_GE_U32:
  case AMDGPU::S_CMP_GE_I32:
  case AMDGPU::S_CMP_EQ_U64:
  case AMDGPU::S_CMPK_EQ_U32:
  case AMDGPU::S_CMPK_EQ_I32:
  case AMDGPU::S_CMPK_GE_U32:
  case AMDGPU::S_CMPK_GE_I32:
    ExpectedValue = 1;
    break;
  case AMDGPU::S_CMP_LG_U32:
  case AMDGPU::S_CMP_LG_I32:
  case AMDGPU::S_CMP_GT_U32:
  case AMDGPU::S_CMP_GT_I32:
  case AMDGPU::S_CMP_LG_U64:
  case AMDGPU::S_CMPK_LG_U32:
  case AMDGPU::S_CMPK_LG_I32:
  case AMDGPU::S_CMPK_GT_U32:
  case AMDGPU::S_CMPK_GT_I32:
    ExpectedValue = 0;
    break;
  default:
    return false;
  }

  if (CmpValue != ExpectedValue)
    return false;

  // The compared value must have a single definition in the same block.
  MachineInstr *AndMI = MRI->getUniqueVRegDef(SrcReg);
  if (!AndMI)
    return false;
  if (AndMI->getParent() != CmpInstr.getParent())
    return false;

  switch (AndMI->getOpcode()) {
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_AND_B64:
    break;
  default:
    return false;
  }

  // One of the two AND sources has to be the literal mask 1.
  const auto IsImmOne = [](const MachineOperand &MO) {
    return MO.isImm() && MO.getImm() == 1;
  };
  if (!IsImmOne(AndMI->getOperand(1)) && !IsImmOne(AndMI->getOperand(2)))
    return false;

  // Nothing between the AND and the compare may write SCC or carry a kill of
  // it; otherwise the AND's SCC value cannot be kept live up to the compare's
  // position.
  const auto CmpPos = CmpInstr.getIterator();
  for (auto It = std::next(AndMI->getIterator()); It != CmpPos; ++It) {
    const bool TouchesSCC = It->modifiesRegister(AMDGPU::SCC, &RI) ||
                            It->killsRegister(AMDGPU::SCC, &RI);
    if (TouchesSCC)
      return false;
  }

  // The AND's SCC def now feeds the compare's users, so it is no longer dead.
  MachineOperand *SccDef = AndMI->findRegisterDefOperand(AMDGPU::SCC);
  SccDef->setIsDead(false);
  CmpInstr.eraseFromParent();

  return true;
}

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
316316
Register DstReg, ArrayRef<MachineOperand> Cond,
317317
Register TrueReg, Register FalseReg) const;
318318

319+
/// Break a scalar compare (S_CMP_* / S_CMPK_*) into the operands it tests.
/// On success \p SrcReg holds operand 0, \p SrcReg2 holds operand 1 when it is
/// a register (empty otherwise), \p CmpValue holds the immediate (0 for the
/// register form) and \p CmpMask is all ones. Returns false for unrecognized
/// instructions.
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
320+
Register &SrcReg2, int64_t &CmpMask,
321+
int64_t &CmpValue) const override;
322+
323+
/// Try to erase \p CmpInstr when it compares the result of an
/// S_AND_B32/S_AND_B64 with mask 1 against 1 (EQ/GE) or 0 (LG/GT), reusing
/// the SCC value already defined by the AND. Only applies when \p SrcReg2 is
/// empty and \p SrcReg is not a physical register. Returns true if the
/// compare was removed.
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
324+
Register SrcReg2, int64_t CmpMask, int64_t CmpValue,
325+
const MachineRegisterInfo *MRI) const override;
326+
319327
unsigned getAddressSpaceForPseudoSourceKind(
320328
unsigned Kind) const override;
321329

llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,6 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
397397
; GCN-NEXT: v_mov_b32_e32 v1, 0x80
398398
; GCN-NEXT: s_waitcnt lgkmcnt(0)
399399
; GCN-NEXT: s_and_b32 s0, 1, s0
400-
; GCN-NEXT: s_cmp_eq_u32 s0, 1
401400
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
402401
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
403402
; GCN-NEXT: flat_store_short v[0:1], v0

llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,6 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
175175
; GCN-NEXT: s_addc_u32 s1, s1, 0
176176
; GCN-NEXT: s_waitcnt lgkmcnt(0)
177177
; GCN-NEXT: s_and_b32 s4, 1, s4
178-
; GCN-NEXT: s_cmp_eq_u32 s4, 1
179178
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
180179
; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
181180
; GCN-NEXT: s_mov_b32 s32, 0
@@ -221,7 +220,6 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
221220
; GCN-NEXT: s_addc_u32 s1, s1, 0
222221
; GCN-NEXT: s_waitcnt lgkmcnt(0)
223222
; GCN-NEXT: s_and_b32 s4, 1, s4
224-
; GCN-NEXT: s_cmp_eq_u32 s4, 1
225223
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
226224
; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
227225
; GCN-NEXT: s_mov_b32 s32, 0

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind re
1717
; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
1818

1919
; GCN-DAG: s_and_b32 [[AND_I1:s[0-9]+]], 1, s{{[0-9]+}}
20-
; GCN: s_cmp_eq_u32 [[AND_I1]], 1
2120

2221
; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
2322
; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]

0 commit comments

Comments
 (0)