Skip to content

Commit e96e7a9

Browse files
authored
[AMDGPU] Implement readcyclecounter for GFX12 (#76965)
1 parent 4d7c5ad commit e96e7a9

File tree

5 files changed

+74
-1
lines changed

5 files changed

+74
-1
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -761,6 +761,12 @@ def FeatureShaderCyclesRegister : SubtargetFeature<"shader-cycles-register",
761761
"Has SHADER_CYCLES hardware register"
762762
>;
763763

// Subtarget feature "shader-cycles-hi-lo-registers". Enabling it sets the
// subtarget field HasShaderCyclesHiLoRegisters to "true"; the help text
// describes it as the presence of SHADER_CYCLES_HI/LO hardware registers.
// In this change FeatureISAVersion12 lists it in place of
// FeatureShaderCyclesRegister.
764+
def FeatureShaderCyclesHiLoRegisters : SubtargetFeature<"shader-cycles-hi-lo-registers",
765+
"HasShaderCyclesHiLoRegisters",
766+
"true",
767+
"Has SHADER_CYCLES_HI/LO hardware registers"
768+
>;
769+
764770
def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts",
765771
"HasMadMacF32Insts",
766772
"true",
@@ -1469,7 +1475,7 @@ def FeatureISAVersion12 : FeatureSet<
14691475
FeatureNSAEncoding,
14701476
FeaturePartialNSAEncoding,
14711477
FeatureWavefrontSize32,
1472-
FeatureShaderCyclesRegister,
1478+
FeatureShaderCyclesHiLoRegisters,
14731479
FeatureArchitectedFlatScratch,
14741480
FeatureAtomicFaddRtnInsts,
14751481
FeatureAtomicFaddNoRtnInsts,
@@ -1970,6 +1976,8 @@ def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">,
19701976
def HasShaderCyclesRegister : Predicate<"Subtarget->hasShaderCyclesRegister()">,
19711977
AssemblerPredicate<(all_of FeatureShaderCyclesRegister)>;
19721978

// Selection predicate that holds when the subtarget's
// hasShaderCyclesHiLoRegisters() returns true. Unlike HasShaderCyclesRegister
// above, no AssemblerPredicate is attached to it.
1979+
def HasShaderCyclesHiLoRegisters : Predicate<"Subtarget->hasShaderCyclesHiLoRegisters()">;
1980+
19731981
def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">,
19741982
AssemblerPredicate<(all_of FeatureFP8Insts)>;
19751983

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
176176
bool HasGetWaveIdInst = false;
177177
bool HasSMemTimeInst = false;
178178
bool HasShaderCyclesRegister = false;
179+
bool HasShaderCyclesHiLoRegisters = false;
179180
bool HasVOP3Literal = false;
180181
bool HasNoDataDepHazard = false;
181182
bool FlatAddressSpace = false;
@@ -819,6 +820,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
819820
return HasShaderCyclesRegister;
820821
}
821822

/// \returns the HasShaderCyclesHiLoRegisters flag, which is declared in this
/// class with a default of false.
823+
bool hasShaderCyclesHiLoRegisters() const {
824+
return HasShaderCyclesHiLoRegisters;
825+
}
826+
822827
bool hasVOP3Literal() const {
823828
return HasVOP3Literal;
824829
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4909,6 +4909,48 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
49094909
MI.eraseFromParent();
49104910
return BB;
49114911
}
// Custom-inserter expansion of the GET_SHADERCYCLESHILO pseudo. It emits three
// S_GETREG_B32 reads (HI, LO, HI), compares the two HI reads, picks either the
// LO read or 0 for the low half, builds the 64-bit result with REG_SEQUENCE,
// and finally erases the pseudo. All new instructions are inserted before MI
// in the current block, which is returned unchanged.
4912+
case AMDGPU::GET_SHADERCYCLESHILO: {
// The pseudo is only expected on subtargets that report the HI/LO registers.
4913+
assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
4914+
MachineRegisterInfo &MRI = MF->getRegInfo();
4915+
const DebugLoc &DL = MI.getDebugLoc();
4916+
// The algorithm is:
4917+
//
4918+
// hi1 = getreg(SHADER_CYCLES_HI)
4919+
// lo1 = getreg(SHADER_CYCLES_LO)
4920+
// hi2 = getreg(SHADER_CYCLES_HI)
4921+
//
4922+
// If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
4923+
// Otherwise there was overflow and the result is hi2:0. In both cases the
4924+
// result should represent the actual time at some point during the sequence
4925+
// of three getregs.
// hi1: first read of SHADER_CYCLES_HI, encoded with offset 0 and width 32.
4926+
Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4927+
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
4928+
.addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI,
4929+
0, 32));
// lo1: read of ID_SHADER_CYCLES with offset 0 and width 32.
// NOTE(review): presumably this ID is what prints as HW_REG_SHADER_CYCLES_LO
// on this target (the accompanying test expects that spelling) -- confirm.
4930+
Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4931+
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
4932+
.addImm(
4933+
AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES, 0, 32));
// hi2: second read of SHADER_CYCLES_HI; this value becomes the high half of
// the result in both the overflow and no-overflow cases.
4934+
Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4935+
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
4936+
.addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI,
4937+
0, 32));
// Compare hi1 with hi2. The select below takes no explicit condition operand;
// presumably it consumes SCC written by this compare -- TODO confirm.
4938+
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
4939+
.addReg(RegHi1)
4940+
.addReg(RegHi2);
// lo = (hi1 == hi2) ? lo1 : 0, matching the algorithm described above.
4941+
Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4942+
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
4943+
.addReg(RegLo1)
4944+
.addImm(0);
// Assemble the pseudo's destination (operand 0): sub0 = lo, sub1 = hi2.
4945+
BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
4946+
.add(MI.getOperand(0))
4947+
.addReg(RegLo)
4948+
.addImm(AMDGPU::sub0)
4949+
.addReg(RegHi2)
4950+
.addImm(AMDGPU::sub1);
// The pseudo is fully expanded at this point, so remove it.
4951+
MI.eraseFromParent();
4952+
return BB;
4953+
}
49124954
case AMDGPU::SI_INDIRECT_SRC_V1:
49134955
case AMDGPU::SI_INDIRECT_SRC_V2:
49144956
case AMDGPU::SI_INDIRECT_SRC_V4:

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,12 @@ def S_USUBO_PSEUDO : SPseudoInstSI <
316316
(outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
317317
>;
318318

// Pseudo instruction with a 64-bit SGPR result ($sdst) and no inputs. Its
// selection pattern matches an i64 `readcyclecounter` node, and it is guarded
// by HasShaderCyclesHiLoRegisters. It is defined inside the enclosing
// `usesCustomInserter = 1, Defs = [SCC]` group that is closed just below.
319+
let OtherPredicates = [HasShaderCyclesHiLoRegisters] in
320+
def GET_SHADERCYCLESHILO : SPseudoInstSI<
321+
(outs SReg_64:$sdst), (ins),
322+
[(set SReg_64:$sdst, (i64 (readcyclecounter)))]
323+
>;
324+
319325
} // End usesCustomInserter = 1, Defs = [SCC]
320326

321327
let usesCustomInserter = 1 in {

llvm/test/CodeGen/AMDGPU/readcyclecounter.ll

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,19 @@
88
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s
99
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-SDAG -check-prefix=GCN %s
1010
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s
11+
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s
12+
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s
1113

1214
declare i64 @llvm.readcyclecounter() #0
1315

1416
; GCN-LABEL: {{^}}test_readcyclecounter:
1517
; MEMTIME-DAG: s_memtime s{{\[[0-9]+:[0-9]+\]}}
1618
; GCN-DAG: s_load_{{dwordx2|b64}}
19+
; GFX12: s_getreg_b32 [[HI1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_HI)
20+
; GFX12: s_getreg_b32 [[LO1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_LO)
21+
; GFX12: s_getreg_b32 [[HI2:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_HI)
22+
; GFX12: s_cmp_eq_u32 [[HI1]], [[HI2]]
23+
; GFX12: s_cselect_b32 {{s[0-9]+}}, [[LO1]], 0
1724
; GCN-DAG: lgkmcnt
1825
; MEMTIME: store_dwordx2
1926
; SIVI-NOT: lgkmcnt
@@ -43,8 +50,13 @@ define amdgpu_kernel void @test_readcyclecounter(ptr addrspace(1) %out) #0 {
4350
;
4451
; GCN-LABEL: {{^}}test_readcyclecounter_smem:
4552
; MEMTIME-DAG: s_memtime
53+
; GFX12: s_getreg_b32 [[HI1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_HI)
54+
; GFX12: s_getreg_b32 [[LO1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_LO)
55+
; GFX12: s_getreg_b32 [[HI2:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_HI)
4656
; GCN-DAG: s_load_{{dword|b32|b64}}
4757
; GETREG-DAG: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_SHADER_CYCLES, 0, 20)
58+
; GFX12: s_cmp_eq_u32 [[HI1]], [[HI2]]
59+
; GFX12: s_cselect_b32 {{s[0-9]+}}, [[LO1]], 0
4860
define amdgpu_cs i32 @test_readcyclecounter_smem(ptr addrspace(4) inreg %in) #0 {
4961
%cycle0 = call i64 @llvm.readcyclecounter()
5062
%in.v = load i64, ptr addrspace(4) %in

0 commit comments

Comments
 (0)