
[AMDGPU] Fix broken sign-extended subword buffer load combine #77470

Merged: 4 commits, Jan 10, 2024
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -111,7 +111,7 @@ def smulu64 : GICombineRule<
          [{ return matchCombine_s_mul_u64(*${smul}, ${matchinfo}); }]),
   (apply [{ applyCombine_s_mul_u64(*${smul}, ${matchinfo}); }])>;
 
-def sign_exension_in_reg_matchdata : GIDefMatchData<"MachineInstr *">;
+def sign_exension_in_reg_matchdata : GIDefMatchData<"std::pair<MachineInstr *, unsigned>">;
 
 def sign_extension_in_reg : GICombineRule<
   (defs root:$sign_inreg, sign_exension_in_reg_matchdata:$matchinfo),
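The remainder of the sign_extension_in_reg rule is collapsed below this hunk. By analogy with the smulu64 rule visible in the same hunk, its match/apply portion presumably hands the (now pair-typed) match data to the C++ helpers changed in AMDGPUPostLegalizerCombiner.cpp; a hedged sketch of that shape, not part of this diff:

// Assumed shape, reconstructed by analogy with smulu64 above: match a
// G_SEXT_INREG root and pass the collected match data between the C++
// match and apply helpers.
def sign_extension_in_reg : GICombineRule<
  (defs root:$sign_inreg, sign_exension_in_reg_matchdata:$matchinfo),
  (match (wip_match_opcode G_SEXT_INREG):$sign_inreg,
         [{ return matchCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }]),
  (apply [{ applyCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }])>;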
46 changes: 24 additions & 22 deletions llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -99,10 +99,10 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner {
 
   // Combine unsigned buffer load and sign extension instructions to generate
   // signed buffer load instructions.
-  bool matchCombineSignExtendInReg(MachineInstr &MI,
-                                   MachineInstr *&MatchInfo) const;
-  void applyCombineSignExtendInReg(MachineInstr &MI,
-                                   MachineInstr *&MatchInfo) const;
+  bool matchCombineSignExtendInReg(
+      MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
+  void applyCombineSignExtendInReg(
+      MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
 
   // Find the s_mul_u64 instructions where the higher bits are either
   // zero-extended or sign-extended.
@@ -395,34 +395,36 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
 
 // Identify buffer_load_{u8, u16}.
 bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
-    MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
-  Register Op0Reg = MI.getOperand(1).getReg();
-  SubwordBufferLoad = MRI.getVRegDef(Op0Reg);
-
-  if (!MRI.hasOneNonDBGUse(Op0Reg))
+    MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
+  Register LoadReg = MI.getOperand(1).getReg();
+  if (!MRI.hasOneNonDBGUse(LoadReg))
     return false;
 
   // Check if the first operand of the sign extension is a subword buffer load
   // instruction.
-  return SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE ||
-         SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
+  MachineInstr *LoadMI = MRI.getVRegDef(LoadReg);
+  int64_t Width = MI.getOperand(2).getImm();
+  switch (LoadMI->getOpcode()) {
+  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
+    MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE};
+    return Width == 8;
+  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
+    MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
+    return Width == 16;
+  }
+  return false;
 }
 
 // Combine buffer_load_{u8, u16} and the sign extension instruction to generate
 // buffer_load_{i8, i16}.
 void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
-    MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
-  // Modify the opcode and the destination of buffer_load_{u8, u16}:
-  // Replace the opcode.
-  unsigned Opc =
-      SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE
-          ? AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE
-          : AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
-  SubwordBufferLoad->setDesc(TII.get(Opc));
-  // Update the destination register of SubwordBufferLoad with the destination
-  // register of the sign extension.
+    MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
+  auto [LoadMI, NewOpcode] = MatchData;
+  LoadMI->setDesc(TII.get(NewOpcode));
+  // Update the destination register of the load with the destination register
+  // of the sign extension.
   Register SignExtendInsnDst = MI.getOperand(0).getReg();
-  SubwordBufferLoad->getOperand(0).setReg(SignExtendInsnDst);
+  LoadMI->getOperand(0).setReg(SignExtendInsnDst);
   // Remove the sign extension.
   MI.eraseFromParent();
 }
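To summarize the fix: the combine now records the signed load opcode at match time and only fires when the G_SEXT_INREG width equals the loaded width. A minimal before/after sketch in generic MIR (operand lists elided with "...", register names illustrative):

; Width matches the load, so the combine folds the extension into the load:
;   %load:_(s32) = G_AMDGPU_BUFFER_LOAD_UBYTE ...
;   %ext:_(s32) = G_SEXT_INREG %load, 8
; becomes
;   %ext:_(s32) = G_AMDGPU_BUFFER_LOAD_SBYTE ...
;
; If the G_SEXT_INREG width is narrower than the load (e.g. G_SEXT_INREG %load, 4
; on a byte load), matchCombineSignExtendInReg now returns false and both
; instructions are kept, because BUFFER_LOAD_SBYTE would sign-extend from bit 8
; rather than bit 4.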
@@ -500,6 +500,49 @@ define amdgpu_ps float @struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr
ret float %cast
}

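; Negative test: the sign extension is from bit 4, narrower than the 8-bit load,
; so the combine must not fire; the unsigned byte load stays and a 4-bit
; V_BFE_I32 performs the sign extension.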
define amdgpu_ps float @struct_buffer_load_i8_sext_wrong_width(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GFX8-LABEL: name: struct_buffer_load_i8_sext_wrong_width
; GFX8: bb.1 (%ir-block.0):
; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX8-NEXT: [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
; GFX8-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_BOTHEN]], 0, 4, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
; GFX12-LABEL: name: struct_buffer_load_i8_sext_wrong_width
; GFX12: bb.1 (%ir-block.0):
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX12-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
; GFX12-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN]], 0, 4, implicit $exec
; GFX12-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%trunc = trunc i8 %val to i4
%ext = sext i4 %trunc to i32
%cast = bitcast i32 %ext to float
ret float %cast
}

define amdgpu_ps float @struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GFX8-LABEL: name: struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
; GFX8: bb.1 (%ir-block.0):
@@ -580,6 +623,49 @@ define amdgpu_ps float @struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgp
ret float %cast
}

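; Negative test: the sign extension is from bit 8, narrower than the 16-bit load,
; so the combine must not fire; the unsigned short load stays and an 8-bit
; V_BFE_I32 performs the sign extension.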
define amdgpu_ps float @struct_buffer_load_i16_sext_wrong_width(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GFX8-LABEL: name: struct_buffer_load_i16_sext_wrong_width
; GFX8: bb.1 (%ir-block.0):
; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX8-NEXT: [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
; GFX8-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_BOTHEN]], 0, 8, implicit $exec
; GFX8-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
; GFX12-LABEL: name: struct_buffer_load_i16_sext_wrong_width
; GFX12: bb.1 (%ir-block.0):
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX12-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
; GFX12-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]], 0, 8, implicit $exec
; GFX12-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%trunc = trunc i16 %val to i8
%ext = sext i8 %trunc to i32
%cast = bitcast i32 %ext to float
ret float %cast
}

; Natural mapping
define amdgpu_ps half @struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GFX8-LABEL: name: struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset