Skip to content

[AMDGPU] Implement readcyclecounter for GFX12 #76965

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -761,6 +761,12 @@ def FeatureShaderCyclesRegister : SubtargetFeature<"shader-cycles-register",
"Has SHADER_CYCLES hardware register"
>;

def FeatureShaderCyclesHiLoRegisters : SubtargetFeature<"shader-cycles-hi-lo-registers",
"HasShaderCyclesHiLoRegisters",
"true",
"Has SHADER_CYCLES_HI/LO hardware registers"
>;

def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts",
"HasMadMacF32Insts",
"true",
Expand Down Expand Up @@ -1469,7 +1475,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureNSAEncoding,
FeaturePartialNSAEncoding,
FeatureWavefrontSize32,
FeatureShaderCyclesRegister,
FeatureShaderCyclesHiLoRegisters,
FeatureArchitectedFlatScratch,
FeatureAtomicFaddRtnInsts,
FeatureAtomicFaddNoRtnInsts,
Expand Down Expand Up @@ -1970,6 +1976,8 @@ def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">,
def HasShaderCyclesRegister : Predicate<"Subtarget->hasShaderCyclesRegister()">,
AssemblerPredicate<(all_of FeatureShaderCyclesRegister)>;

def HasShaderCyclesHiLoRegisters : Predicate<"Subtarget->hasShaderCyclesHiLoRegisters()">;

def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">,
AssemblerPredicate<(all_of FeatureFP8Insts)>;

Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasGetWaveIdInst = false;
bool HasSMemTimeInst = false;
bool HasShaderCyclesRegister = false;
bool HasShaderCyclesHiLoRegisters = false;
bool HasVOP3Literal = false;
bool HasNoDataDepHazard = false;
bool FlatAddressSpace = false;
Expand Down Expand Up @@ -819,6 +820,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return HasShaderCyclesRegister;
}

bool hasShaderCyclesHiLoRegisters() const {
return HasShaderCyclesHiLoRegisters;
}

bool hasVOP3Literal() const {
return HasVOP3Literal;
}
Expand Down
42 changes: 42 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4827,6 +4827,48 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
case AMDGPU::GET_SHADERCYCLESHILO: {
assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
MachineRegisterInfo &MRI = MF->getRegInfo();
const DebugLoc &DL = MI.getDebugLoc();
// The algorithm is:
//
// hi1 = getreg(SHADER_CYCLES_HI)
// lo1 = getreg(SHADER_CYCLES_LO)
// hi2 = getreg(SHADER_CYCLES_HI)
//
// If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
// Otherwise there was overflow and the result is hi2:0. In both cases the
// result should represent the actual time at some point during the sequence
// of three getregs.
Comment on lines +4840 to +4843
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is pretty ugly

Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
.addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI,
0, 32));
Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
.addImm(
AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES, 0, 32));
Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
.addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI,
0, 32));
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
.addReg(RegHi1)
.addReg(RegHi2);
Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
.addReg(RegLo1)
.addImm(0);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
.add(MI.getOperand(0))
.addReg(RegLo)
.addImm(AMDGPU::sub0)
.addReg(RegHi2)
.addImm(AMDGPU::sub1);
MI.eraseFromParent();
return BB;
}
case AMDGPU::SI_INDIRECT_SRC_V1:
case AMDGPU::SI_INDIRECT_SRC_V2:
case AMDGPU::SI_INDIRECT_SRC_V4:
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,12 @@ def S_USUBO_PSEUDO : SPseudoInstSI <
(outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
>;

let OtherPredicates = [HasShaderCyclesHiLoRegisters] in
def GET_SHADERCYCLESHILO : SPseudoInstSI<
(outs SReg_64:$sdst), (ins),
[(set SReg_64:$sdst, (i64 (readcyclecounter)))]
>;

} // End usesCustomInserter = 1, Defs = [SCC]

let usesCustomInserter = 1 in {
Expand Down
12 changes: 12 additions & 0 deletions llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,19 @@
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-SDAG -check-prefix=GCN %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s

declare i64 @llvm.readcyclecounter() #0

; GCN-LABEL: {{^}}test_readcyclecounter:
; MEMTIME-DAG: s_memtime s{{\[[0-9]+:[0-9]+\]}}
; GCN-DAG: s_load_{{dwordx2|b64}}
; GFX12: s_getreg_b32 [[HI1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_HI)
; GFX12: s_getreg_b32 [[LO1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_LO)
; GFX12: s_getreg_b32 [[HI2:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_HI)
; GFX12: s_cmp_eq_u32 [[HI1]], [[HI2]]
; GFX12: s_cselect_b32 {{s[0-9]+}}, [[LO1]], 0
; GCN-DAG: lgkmcnt
; MEMTIME: store_dwordx2
; SIVI-NOT: lgkmcnt
Expand Down Expand Up @@ -43,8 +50,13 @@ define amdgpu_kernel void @test_readcyclecounter(ptr addrspace(1) %out) #0 {
;
; GCN-LABEL: {{^}}test_readcyclecounter_smem:
; MEMTIME-DAG: s_memtime
; GFX12: s_getreg_b32 [[HI1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_HI)
; GFX12: s_getreg_b32 [[LO1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_LO)
; GFX12: s_getreg_b32 [[HI2:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_HI)
; GCN-DAG: s_load_{{dword|b32|b64}}
; GETREG-DAG: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_SHADER_CYCLES, 0, 20)
; GFX12: s_cmp_eq_u32 [[HI1]], [[HI2]]
; GFX12: s_cselect_b32 {{s[0-9]+}}, [[LO1]], 0
define amdgpu_cs i32 @test_readcyclecounter_smem(ptr addrspace(4) inreg %in) #0 {
%cycle0 = call i64 @llvm.readcyclecounter()
%in.v = load i64, ptr addrspace(4) %in
Expand Down