Skip to content

[AMDGPU][MC] Add GFX12 SMEM encoding #75215

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,12 @@ def FeatureVGPRIndexMode : SubtargetFeature<"vgpr-index-mode",
"Has VGPR mode register indexing"
>;

// Subtarget feature "scalar-dwordx3-loads".
// When enabled, the subtarget attribute HasScalarDwordx3Loads is set to "true",
// marking targets that have 96-bit scalar load instructions.
def FeatureScalarDwordx3Loads : SubtargetFeature<"scalar-dwordx3-loads",
"HasScalarDwordx3Loads",
"true",
"Has 96-bit scalar load instructions"
>;

def FeatureScalarStores : SubtargetFeature<"scalar-stores",
"HasScalarStores",
"true",
Expand Down Expand Up @@ -1462,7 +1468,8 @@ def FeatureISAVersion12 : FeatureSet<
FeatureVcmpxPermlaneHazard,
FeatureSALUFloatInsts,
FeatureVGPRSingleUseHintInsts,
FeatureMADIntraFwdBug]>;
FeatureMADIntraFwdBug,
FeatureScalarDwordx3Loads]>;

//===----------------------------------------------------------------------===//

Expand Down Expand Up @@ -2011,6 +2018,8 @@ def HasNoCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">;

def HasAtomicCSubNoRtnInsts : Predicate<"Subtarget->hasAtomicCSubNoRtnInsts()">;

// Predicate that holds when Subtarget->hasScalarDwordx3Loads() returns true.
def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;

// Include AMDGPU TD files
include "SISchedule.td"
include "GCNProcessors.td"
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2635,7 +2635,7 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
if (RegKind == IS_SGPR || RegKind == IS_TTMP) {
// SGPR and TTMP registers must be aligned.
// Max required alignment is 4 dwords.
AlignSize = std::min(RegWidth / 32, 4u);
AlignSize = std::min(llvm::bit_ceil(RegWidth / 32), 4u);
}

if (RegNum % AlignSize != 0) {
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)
DECODE_OPERAND_REG_7(SReg_64, OPW64)
DECODE_OPERAND_REG_7(SReg_64_XEXEC, OPW64)
DECODE_OPERAND_REG_7(SReg_96, OPW96)
DECODE_OPERAND_REG_7(SReg_128, OPW128)
DECODE_OPERAND_REG_7(SReg_256, OPW256)
DECODE_OPERAND_REG_7(SReg_512, OPW512)
Expand Down Expand Up @@ -1239,6 +1240,8 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
case AMDGPU::TTMP_64RegClassID:
shift = 1;
break;
case AMDGPU::SGPR_96RegClassID:
case AMDGPU::TTMP_96RegClassID:
case AMDGPU::SGPR_128RegClassID:
case AMDGPU::TTMP_128RegClassID:
// ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasFmaMixInsts = false;
bool HasMovrel = false;
bool HasVGPRIndexMode = false;
bool HasScalarDwordx3Loads = false;
bool HasScalarStores = false;
bool HasScalarAtomics = false;
bool HasSDWAOmod = false;
Expand Down Expand Up @@ -886,6 +887,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return getGeneration() >= VOLCANIC_ISLANDS;
}

/// \returns the value of the HasScalarDwordx3Loads subtarget flag.
bool hasScalarDwordx3Loads() const {
  return HasScalarDwordx3Loads;
}

bool hasScalarStores() const {
return HasScalarStores;
}
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SIRegisterInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
// SGPR 64-bit registers
def SGPR_64Regs : SIRegisterTuples<getSubRegs<2>.ret, SGPR_32, 105, 2, 2, "s">;

// SGPR 96-bit registers. No operations use these, but for symmetry with 96-bit VGPRs.
// SGPR 96-bit registers.
def SGPR_96Regs : SIRegisterTuples<getSubRegs<3>.ret, SGPR_32, 105, 4, 3, "s">;

// SGPR 128-bit registers
Expand Down
77 changes: 66 additions & 11 deletions llvm/lib/Target/AMDGPU/SMInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ class SM_Real <SM_Pseudo ps, string opName = ps.Mnemonic>
bits<7> sdst;
bits<32> offset;
bits<8> soffset;
bits<5> cpol;
bits<5> cpol;
}

class OffsetMode<bit hasOffset, bit hasSOffset, string variant,
Expand Down Expand Up @@ -300,6 +300,8 @@ multiclass SM_Pseudo_Atomics<RegisterClass baseClass,
// does sdst for SMRD on SI/CI?
defm S_LOAD_DWORD : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>;
defm S_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_64, SReg_64_XEXEC>;
// The DWORDX3 pseudo loads use the 96-bit SReg_96 class and are gated on the
// HasScalarDwordx3Loads subtarget predicate.
let SubtargetPredicate = HasScalarDwordx3Loads in
defm S_LOAD_DWORDX3 : SM_Pseudo_Loads <SReg_64, SReg_96>;
defm S_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_64, SReg_128>;
defm S_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_64, SReg_256>;
defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_64, SReg_512>;
Expand All @@ -309,6 +311,8 @@ defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
// FIXME: exec_lo/exec_hi appear to be allowed for SMRD loads on
// SI/CI, but disallowed for SMEM on VI.
defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_128, SReg_64_XEXEC>;
// The DWORDX3 buffer pseudo loads use the 96-bit SReg_96 class and are gated on
// the HasScalarDwordx3Loads subtarget predicate.
let SubtargetPredicate = HasScalarDwordx3Loads in
defm S_BUFFER_LOAD_DWORDX3 : SM_Pseudo_Loads <SReg_128, SReg_96>;
defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_128, SReg_128>;
defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_128, SReg_256>;
defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_128, SReg_512>;
Expand Down Expand Up @@ -1179,7 +1183,7 @@ def SMInfoTable : GenericTable {
class SMEM_Real_gfx11<bits<8> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
SMEM_Real_10Plus_common<op, ps, opName, SIEncodingFamily.GFX11,
SGPR_NULL_gfx11plus> {
let AssemblerPredicate = isGFX11Plus;
let AssemblerPredicate = isGFX11Only;
let DecoderNamespace = "GFX11";
let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, 0);
let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, 0);
Expand Down Expand Up @@ -1235,28 +1239,79 @@ defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx11 <0x23>;
// GFX12.
//===----------------------------------------------------------------------===//

class SMEM_Real_gfx12<bits<8> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
SMEM_Real_10Plus_common<op, ps, opName, SIEncodingFamily.GFX12,
SGPR_NULL_gfx11plus> {
// Common 64-bit (Enc64) encoding shared by GFX12+ SMEM real instructions.
//   op        - 6-bit opcode, placed in Inst{18-13}.
//   ps        - the pseudo instruction this real instruction encodes.
//   opName    - assembly mnemonic passed through to SM_Real.
//   subtarget - encoding family passed to SIMCInstr.
//   sgpr_null - register whose HW encoding fills the soffset field when the
//               pseudo has an immediate offset but no soffset operand.
class SMEM_Real_gfx12Plus<bits<6> op, SM_Pseudo ps, string opName,
int subtarget, RegisterWithSubRegs sgpr_null> :
SM_Real<ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>, Enc64 {

let Inst{18-13} = op;
// Fixed bit pattern 0x3d in the top six bits of the first dword.
let Inst{31-26} = 0x3d;

// Inst{55-32}: low 24 bits of the immediate offset when the pseudo has one;
// 0 when it only has soffset; otherwise left unset (`?`).
let Inst{55-32} = !if(ps.has_offset, offset{23-0}, !if(ps.has_soffset, 0, ?));
// Inst{63-57}: soffset register when present; else the sgpr_null encoding if
// there is an immediate offset; otherwise left unset (`?`).
let Inst{63-57} = !if(ps.has_soffset, soffset{6-0},
!if(ps.has_offset, sgpr_null.HWEncoding{6-0}, ?));
}

class SMEM_Real_gfx12<bits<6> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
SMEM_Real_gfx12Plus<op, ps, opName, SIEncodingFamily.GFX12,
SGPR_NULL_gfx11plus> {
let AssemblerPredicate = isGFX12Plus;
let DecoderNamespace = "GFX12";
let Inst{18-13} = op{5-0};
let Inst{19} = !if(ps.has_dlc, cpol{CPolBit.DLC}, 0);
let Inst{24-20} = ?; // TODO-GFX12: Add new bits {24-20}: TH, Scope, NV
let Inst{25} = !if(ps.has_glc, cpol{CPolBit.GLC}, 0);
let Inst{55-32} = offset{23-0};

let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
}

class SMEM_Real_Prefetch_gfx12 <bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx12<op, ps> {
class SMEM_Real_Prefetch_gfx12<bits<6> op, SM_Pseudo ps> :
SMEM_Real_gfx12<op, ps> {
bits<7> sdata; // Only 5 bits of sdata are supported.

let sdst = ?;
let Inst{12-11} = 0; // Unused sdata bits.
let Inst{10-6} = !if(ps.has_sdst, sdata{4-0}, ?);
}

// GFX12 real encoding for an SMEM load.
//   op      - 6-bit opcode forwarded to SMEM_Real_gfx12.
//   ps      - base name of the pseudo; the pseudo actually used is looked up
//             as `ps # offsets.Variant`.
//   opName  - assembly mnemonic.
//   offsets - offset mode supplying the variant suffix and offset operands.
class SMEM_Real_Load_gfx12<bits<6> op, string ps, string opName, OffsetMode offsets> :
SMEM_Real_gfx12<op, !cast<SM_Pseudo>(ps # offsets.Variant), opName> {
// Reuse the base register class recorded on the matching load pseudo.
RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass;
// Inputs: sbase, then the offset-mode operands, then the cache-policy operand.
let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol));

let Inst{22-21} = cpol{4-3}; // scope
let Inst{24-23} = cpol{1-0}; // th - only lower 2 bits are supported
}

// Defines the GFX12 real load instructions for one opcode, one per offset mode
// (IMM_Offset and SGPR_IMM_Offset).
//   op - 6-bit opcode.
//   ps - base name of the pseudo to encode; defaults to the defm NAME.
// The assembly mnemonic is the lower-cased defm NAME.
multiclass SM_Real_Loads_gfx12<bits<6> op, string ps = NAME> {
defvar opName = !tolower(NAME);
def _IMM_gfx12 : SMEM_Real_Load_gfx12<op, ps, opName, IMM_Offset>;
def _SGPR_IMM_gfx12 : SMEM_Real_Load_gfx12<op, ps, opName, SGPR_IMM_Offset>;
}

// GFX12 scalar loads. Each S_LOAD_B<bits> name is backed by the existing
// S_LOAD_DWORD[Xn] pseudo named in the second argument. Note the 96-bit
// variant uses opcode 0x05, so opcodes are not in size order.
defm S_LOAD_B32 : SM_Real_Loads_gfx12<0x00, "S_LOAD_DWORD">;
defm S_LOAD_B64 : SM_Real_Loads_gfx12<0x01, "S_LOAD_DWORDX2">;
defm S_LOAD_B96 : SM_Real_Loads_gfx12<0x05, "S_LOAD_DWORDX3">;
defm S_LOAD_B128 : SM_Real_Loads_gfx12<0x02, "S_LOAD_DWORDX4">;
defm S_LOAD_B256 : SM_Real_Loads_gfx12<0x03, "S_LOAD_DWORDX8">;
defm S_LOAD_B512 : SM_Real_Loads_gfx12<0x04, "S_LOAD_DWORDX16">;

// GFX12 scalar buffer loads, backed by the S_BUFFER_LOAD_DWORD[Xn] pseudos.
// The 96-bit variant uses opcode 0x15, again out of size order.
defm S_BUFFER_LOAD_B32 : SM_Real_Loads_gfx12<0x10, "S_BUFFER_LOAD_DWORD">;
defm S_BUFFER_LOAD_B64 : SM_Real_Loads_gfx12<0x11, "S_BUFFER_LOAD_DWORDX2">;
defm S_BUFFER_LOAD_B96 : SM_Real_Loads_gfx12<0x15, "S_BUFFER_LOAD_DWORDX3">;
defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx12<0x12, "S_BUFFER_LOAD_DWORDX4">;
defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx12<0x13, "S_BUFFER_LOAD_DWORDX8">;
defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx12<0x14, "S_BUFFER_LOAD_DWORDX16">;

// GFX12 real encoding of S_DCACHE_INV (opcode 0x21).
def S_DCACHE_INV_gfx12 : SMEM_Real_gfx12<0x021, S_DCACHE_INV>;

// GFX12 real encodings of the prefetch pseudos (opcodes 0x24-0x28).
def S_PREFETCH_INST_gfx12 : SMEM_Real_Prefetch_gfx12<0x24, S_PREFETCH_INST>;
def S_PREFETCH_INST_PC_REL_gfx12 : SMEM_Real_Prefetch_gfx12<0x25, S_PREFETCH_INST_PC_REL>;
def S_PREFETCH_DATA_gfx12 : SMEM_Real_Prefetch_gfx12<0x26, S_PREFETCH_DATA>;
def S_BUFFER_PREFETCH_DATA_gfx12 : SMEM_Real_Prefetch_gfx12<0x27, S_BUFFER_PREFETCH_DATA>;
def S_PREFETCH_DATA_PC_REL_gfx12 : SMEM_Real_Prefetch_gfx12<0x28, S_PREFETCH_DATA_PC_REL>;

// Defines the GFX12 real encodings of a probe instruction for one opcode.
// It emits an _IMM and an _SGPR_IMM variant, each looked up from the
// SM_Probe_Pseudo named after the defm NAME, and reuses the
// SMEM_Real_Prefetch_gfx12 encoding class.
multiclass SMEM_Real_Probe_gfx12<bits<6> op> {
defvar ps = NAME;
def _IMM_gfx12 : SMEM_Real_Prefetch_gfx12<op, !cast<SM_Probe_Pseudo>(ps#_IMM)>;
def _SGPR_IMM_gfx12 : SMEM_Real_Prefetch_gfx12<op, !cast<SM_Probe_Pseudo>(ps#_SGPR_IMM)>;
}

// Probe instructions: opcodes 0x22 and 0x23.
defm S_ATC_PROBE : SMEM_Real_Probe_gfx12<0x22>;
defm S_ATC_PROBE_BUFFER : SMEM_Real_Probe_gfx12<0x23>;
6 changes: 6 additions & 0 deletions llvm/test/MC/AMDGPU/gfx11_asm_err.s
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,9 @@ scratch_store_b128 off, v[2:5], s0 offset:8000000

flat_atomic_add_f32 v1, v[0:1], v2 offset:-1
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: expected a 12-bit unsigned offset

s_load_b96 s[20:22], s[2:3], s0
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

s_buffer_load_b96 s[20:22], s[4:7], s0
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
Loading