Skip to content

Commit 08da7ac

Browse files
authored
[AMDGPU] Fix broken sign-extended subword buffer load combine (#77470)
1 parent 5cc0344 commit 08da7ac

File tree

3 files changed

+111
-23
lines changed

3 files changed

+111
-23
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCombine.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ def smulu64 : GICombineRule<
111111
[{ return matchCombine_s_mul_u64(*${smul}, ${matchinfo}); }]),
112112
(apply [{ applyCombine_s_mul_u64(*${smul}, ${matchinfo}); }])>;
113113

114-
def sign_exension_in_reg_matchdata : GIDefMatchData<"MachineInstr *">;
114+
def sign_exension_in_reg_matchdata : GIDefMatchData<"std::pair<MachineInstr *, unsigned>">;
115115

116116
def sign_extension_in_reg : GICombineRule<
117117
(defs root:$sign_inreg, sign_exension_in_reg_matchdata:$matchinfo),

llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,10 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner {
9999

100100
// Combine unsigned buffer load and signed extension instructions to generate
101101
// signed buffer laod instructions.
102-
bool matchCombineSignExtendInReg(MachineInstr &MI,
103-
MachineInstr *&MatchInfo) const;
104-
void applyCombineSignExtendInReg(MachineInstr &MI,
105-
MachineInstr *&MatchInfo) const;
102+
bool matchCombineSignExtendInReg(
103+
MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
104+
void applyCombineSignExtendInReg(
105+
MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
106106

107107
// Find the s_mul_u64 instructions where the higher bits are either
108108
// zero-extended or sign-extended.
@@ -395,34 +395,36 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
395395

396396
// Identify buffer_load_{u8, u16}.
397397
bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
398-
MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
399-
Register Op0Reg = MI.getOperand(1).getReg();
400-
SubwordBufferLoad = MRI.getVRegDef(Op0Reg);
401-
402-
if (!MRI.hasOneNonDBGUse(Op0Reg))
398+
MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
399+
Register LoadReg = MI.getOperand(1).getReg();
400+
if (!MRI.hasOneNonDBGUse(LoadReg))
403401
return false;
404402

405403
// Check if the first operand of the sign extension is a subword buffer load
406404
// instruction.
407-
return SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE ||
408-
SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
405+
MachineInstr *LoadMI = MRI.getVRegDef(LoadReg);
406+
int64_t Width = MI.getOperand(2).getImm();
407+
switch (LoadMI->getOpcode()) {
408+
case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
409+
MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE};
410+
return Width == 8;
411+
case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
412+
MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
413+
return Width == 16;
414+
}
415+
return false;
409416
}
410417

411418
// Combine buffer_load_{u8, u16} and the sign extension instruction to generate
412419
// buffer_load_{i8, i16}.
413420
void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
414-
MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
415-
// Modify the opcode and the destination of buffer_load_{u8, u16}:
416-
// Replace the opcode.
417-
unsigned Opc =
418-
SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE
419-
? AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE
420-
: AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
421-
SubwordBufferLoad->setDesc(TII.get(Opc));
422-
// Update the destination register of SubwordBufferLoad with the destination
423-
// register of the sign extension.
421+
MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
422+
auto [LoadMI, NewOpcode] = MatchData;
423+
LoadMI->setDesc(TII.get(NewOpcode));
424+
// Update the destination register of the load with the destination register
425+
// of the sign extension.
424426
Register SignExtendInsnDst = MI.getOperand(0).getReg();
425-
SubwordBufferLoad->getOperand(0).setReg(SignExtendInsnDst);
427+
LoadMI->getOperand(0).setReg(SignExtendInsnDst);
426428
// Remove the sign extension.
427429
MI.eraseFromParent();
428430
}

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,49 @@ define amdgpu_ps float @struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr
500500
ret float %cast
501501
}
502502

503+
define amdgpu_ps float @struct_buffer_load_i8_sext_wrong_width(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
504+
; GFX8-LABEL: name: struct_buffer_load_i8_sext_wrong_width
505+
; GFX8: bb.1 (%ir-block.0):
506+
; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
507+
; GFX8-NEXT: {{ $}}
508+
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
509+
; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
510+
; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
511+
; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
512+
; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
513+
; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
514+
; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
515+
; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
516+
; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
517+
; GFX8-NEXT: [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
518+
; GFX8-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_BOTHEN]], 0, 4, implicit $exec
519+
; GFX8-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
520+
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
521+
;
522+
; GFX12-LABEL: name: struct_buffer_load_i8_sext_wrong_width
523+
; GFX12: bb.1 (%ir-block.0):
524+
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
525+
; GFX12-NEXT: {{ $}}
526+
; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
527+
; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
528+
; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
529+
; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
530+
; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
531+
; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
532+
; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
533+
; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
534+
; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
535+
; GFX12-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
536+
; GFX12-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN]], 0, 4, implicit $exec
537+
; GFX12-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
538+
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
539+
%val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
540+
%trunc = trunc i8 %val to i4
541+
%ext = sext i4 %trunc to i32
542+
%cast = bitcast i32 %ext to float
543+
ret float %cast
544+
}
545+
503546
define amdgpu_ps float @struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
504547
; GFX8-LABEL: name: struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
505548
; GFX8: bb.1 (%ir-block.0):
@@ -580,6 +623,49 @@ define amdgpu_ps float @struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgp
580623
ret float %cast
581624
}
582625

626+
define amdgpu_ps float @struct_buffer_load_i16_sext_wrong_width(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
627+
; GFX8-LABEL: name: struct_buffer_load_i16_sext_wrong_width
628+
; GFX8: bb.1 (%ir-block.0):
629+
; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
630+
; GFX8-NEXT: {{ $}}
631+
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
632+
; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
633+
; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
634+
; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
635+
; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
636+
; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
637+
; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
638+
; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
639+
; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
640+
; GFX8-NEXT: [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
641+
; GFX8-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_BOTHEN]], 0, 8, implicit $exec
642+
; GFX8-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
643+
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
644+
;
645+
; GFX12-LABEL: name: struct_buffer_load_i16_sext_wrong_width
646+
; GFX12: bb.1 (%ir-block.0):
647+
; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
648+
; GFX12-NEXT: {{ $}}
649+
; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
650+
; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
651+
; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
652+
; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
653+
; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
654+
; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
655+
; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
656+
; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
657+
; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
658+
; GFX12-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
659+
; GFX12-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]], 0, 8, implicit $exec
660+
; GFX12-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
661+
; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
662+
%val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
663+
%trunc = trunc i16 %val to i8
664+
%ext = sext i8 %trunc to i32
665+
%cast = bitcast i32 %ext to float
666+
ret float %cast
667+
}
668+
583669
; Natural mapping
584670
define amdgpu_ps half @struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
585671
; GFX8-LABEL: name: struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset

0 commit comments

Comments
 (0)