Skip to content

Commit f5b6866

Browse files
committed
[AMDGPU] Add missing testcase for SGPR to AGPR copy
Also update the function indirectCopyToAGPR() to assert that it is called only on the GFX908 subtarget. Reviewed By: rampitec. Differential Revision: https://reviews.llvm.org/D122286
1 parent 850de56 commit f5b6866

File tree

2 files changed

+280
-26
lines changed

2 files changed

+280
-26
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 26 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -540,8 +540,9 @@ static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
540540
.addReg(SrcReg, getKillRegState(KillSrc));
541541
}
542542

543-
/// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible
544-
/// to directly copy, so an intermediate VGPR needs to be used.
543+
/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
544+
/// possible to have a direct copy in these cases on GFX908, so an intermediate
545+
/// VGPR copy is required.
545546
static void indirectCopyToAGPR(const SIInstrInfo &TII,
546547
MachineBasicBlock &MBB,
547548
MachineBasicBlock::iterator MI,
@@ -550,10 +551,18 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
550551
RegScavenger &RS,
551552
Register ImpDefSuperReg = Register(),
552553
Register ImpUseSuperReg = Register()) {
553-
const SIRegisterInfo &RI = TII.getRegisterInfo();
554+
assert((TII.getSubtarget().hasMAIInsts() &&
555+
!TII.getSubtarget().hasGFX90AInsts()) &&
556+
"Expected GFX908 subtarget.");
557+
558+
assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
559+
AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
560+
"Source register of the copy should be either an SGPR or an AGPR.");
554561

555-
assert(AMDGPU::SReg_32RegClass.contains(SrcReg) ||
556-
AMDGPU::AGPR_32RegClass.contains(SrcReg));
562+
assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
563+
"Destination register of the copy should be an AGPR.");
564+
565+
const SIRegisterInfo &RI = TII.getRegisterInfo();
557566

558567
// First try to find defining accvgpr_write to avoid temporary registers.
559568
for (auto Def = MI, E = MBB.begin(); Def != E; ) {
@@ -605,23 +614,18 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
605614
// Registers in the sequence are allocated contiguously so we can just
606615
// use register number to pick one of three round-robin temps.
607616
unsigned RegNo = DestReg % 3;
608-
Register Tmp;
609-
if (!TII.getSubtarget().hasGFX90AInsts()) {
610-
Tmp = AMDGPU::VGPR32;
611-
assert(MBB.getParent()->getRegInfo().isReserved(AMDGPU::VGPR32));
612-
613-
// Only loop through if there are any free registers left, otherwise
614-
// scavenger may report a fatal error without emergency spill slot
615-
// or spill with the slot.
616-
while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
617-
Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
618-
if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
619-
break;
620-
Tmp = Tmp2;
621-
RS.setRegUsed(Tmp);
622-
}
623-
} else {
624-
Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
617+
Register Tmp = AMDGPU::VGPR32;
618+
assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
619+
"VGPR used for an intermediate copy should have been reserved.");
620+
621+
// Only loop through if there are any free registers left, otherwise
622+
// scavenger may report a fatal error without emergency spill slot
623+
// or spill with the slot.
624+
while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
625+
Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
626+
if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
627+
break;
628+
Tmp = Tmp2;
625629
RS.setRegUsed(Tmp);
626630
}
627631

llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll

Lines changed: 254 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
33
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
44

5-
; This testcase would fail due to not having a free VGPR available to
5+
; This testcase would fail on GFX908 due to not having a free VGPR available to
66
; copy between AGPRs.
7-
define void @no_free_vgprs_at_agpr_copy(float %v0, float %v1) #0 {
8-
; GFX908-LABEL: no_free_vgprs_at_agpr_copy:
7+
define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
8+
; GFX908-LABEL: no_free_vgprs_at_agpr_to_agpr_copy:
99
; GFX908: ; %bb.0:
1010
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1111
; GFX908-NEXT: v_mov_b32_e32 v33, v1
@@ -156,7 +156,7 @@ define void @no_free_vgprs_at_agpr_copy(float %v0, float %v1) #0 {
156156
; GFX908-NEXT: ;;#ASMEND
157157
; GFX908-NEXT: s_setpc_b64 s[30:31]
158158
;
159-
; GFX90A-LABEL: no_free_vgprs_at_agpr_copy:
159+
; GFX90A-LABEL: no_free_vgprs_at_agpr_to_agpr_copy:
160160
; GFX90A: ; %bb.0:
161161
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
162162
; GFX90A-NEXT: v_mov_b32_e32 v33, v0
@@ -864,6 +864,256 @@ bb58: ; preds = %bb51, %bb16
864864
br i1 %i66, label %bb16, label %bb12
865865
}
866866

867+
; This testcase would fail on GFX908 due to not having a free VGPR available to
868+
; copy SGPR to AGPR.
869+
define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
870+
; GFX908-LABEL: no_free_vgprs_at_sgpr_to_agpr_copy:
871+
; GFX908: ; %bb.0:
872+
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
873+
; GFX908-NEXT: v_mov_b32_e32 v33, v1
874+
; GFX908-NEXT: v_mov_b32_e32 v34, v0
875+
; GFX908-NEXT: ;;#ASMSTART
876+
; GFX908-NEXT: ; def v[0:31] s[0:15]
877+
; GFX908-NEXT: ;;#ASMEND
878+
; GFX908-NEXT: v_mov_b32_e32 v32, s15
879+
; GFX908-NEXT: s_nop 1
880+
; GFX908-NEXT: v_accvgpr_write_b32 a31, v32
881+
; GFX908-NEXT: v_mov_b32_e32 v32, s14
882+
; GFX908-NEXT: s_nop 1
883+
; GFX908-NEXT: v_accvgpr_write_b32 a30, v32
884+
; GFX908-NEXT: v_mov_b32_e32 v32, s13
885+
; GFX908-NEXT: s_nop 1
886+
; GFX908-NEXT: v_accvgpr_write_b32 a29, v32
887+
; GFX908-NEXT: v_mov_b32_e32 v32, s12
888+
; GFX908-NEXT: s_nop 1
889+
; GFX908-NEXT: v_accvgpr_write_b32 a28, v32
890+
; GFX908-NEXT: v_mov_b32_e32 v32, s11
891+
; GFX908-NEXT: s_nop 1
892+
; GFX908-NEXT: v_accvgpr_write_b32 a27, v32
893+
; GFX908-NEXT: v_mov_b32_e32 v32, s10
894+
; GFX908-NEXT: s_nop 1
895+
; GFX908-NEXT: v_accvgpr_write_b32 a26, v32
896+
; GFX908-NEXT: v_mov_b32_e32 v32, s9
897+
; GFX908-NEXT: s_nop 1
898+
; GFX908-NEXT: v_accvgpr_write_b32 a25, v32
899+
; GFX908-NEXT: v_mov_b32_e32 v32, s8
900+
; GFX908-NEXT: s_nop 1
901+
; GFX908-NEXT: v_accvgpr_write_b32 a24, v32
902+
; GFX908-NEXT: v_mov_b32_e32 v32, s7
903+
; GFX908-NEXT: s_nop 1
904+
; GFX908-NEXT: v_accvgpr_write_b32 a23, v32
905+
; GFX908-NEXT: v_mov_b32_e32 v32, s6
906+
; GFX908-NEXT: s_nop 1
907+
; GFX908-NEXT: v_accvgpr_write_b32 a22, v32
908+
; GFX908-NEXT: v_mov_b32_e32 v32, s5
909+
; GFX908-NEXT: s_nop 1
910+
; GFX908-NEXT: v_accvgpr_write_b32 a21, v32
911+
; GFX908-NEXT: v_mov_b32_e32 v32, s4
912+
; GFX908-NEXT: s_nop 1
913+
; GFX908-NEXT: v_accvgpr_write_b32 a20, v32
914+
; GFX908-NEXT: v_mov_b32_e32 v32, s3
915+
; GFX908-NEXT: s_nop 1
916+
; GFX908-NEXT: v_accvgpr_write_b32 a19, v32
917+
; GFX908-NEXT: v_mov_b32_e32 v32, s2
918+
; GFX908-NEXT: s_nop 1
919+
; GFX908-NEXT: v_accvgpr_write_b32 a18, v32
920+
; GFX908-NEXT: v_mov_b32_e32 v32, s1
921+
; GFX908-NEXT: s_nop 1
922+
; GFX908-NEXT: v_accvgpr_write_b32 a17, v32
923+
; GFX908-NEXT: v_mov_b32_e32 v32, s0
924+
; GFX908-NEXT: s_nop 1
925+
; GFX908-NEXT: v_accvgpr_write_b32 a16, v32
926+
; GFX908-NEXT: s_nop 0
927+
; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v34, v33, a[16:31]
928+
; GFX908-NEXT: s_nop 7
929+
; GFX908-NEXT: s_nop 1
930+
; GFX908-NEXT: v_accvgpr_read_b32 v32, a0 ; Reload Reuse
931+
; GFX908-NEXT: v_accvgpr_read_b32 v39, a11 ; Reload Reuse
932+
; GFX908-NEXT: v_accvgpr_read_b32 v38, a12 ; Reload Reuse
933+
; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
934+
; GFX908-NEXT: v_accvgpr_read_b32 v32, a1 ; Reload Reuse
935+
; GFX908-NEXT: v_accvgpr_read_b32 v37, a13 ; Reload Reuse
936+
; GFX908-NEXT: v_accvgpr_read_b32 v36, a14 ; Reload Reuse
937+
; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
938+
; GFX908-NEXT: v_accvgpr_read_b32 v32, a2 ; Reload Reuse
939+
; GFX908-NEXT: v_accvgpr_read_b32 v35, a15 ; Reload Reuse
940+
; GFX908-NEXT: s_nop 0
941+
; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
942+
; GFX908-NEXT: v_accvgpr_read_b32 v32, a3 ; Reload Reuse
943+
; GFX908-NEXT: s_nop 1
944+
; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
945+
; GFX908-NEXT: v_accvgpr_read_b32 v32, a4 ; Reload Reuse
946+
; GFX908-NEXT: s_nop 1
947+
; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
948+
; GFX908-NEXT: v_accvgpr_read_b32 v32, a5 ; Reload Reuse
949+
; GFX908-NEXT: s_nop 1
950+
; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
951+
; GFX908-NEXT: v_accvgpr_read_b32 v32, a6 ; Reload Reuse
952+
; GFX908-NEXT: s_nop 1
953+
; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
954+
; GFX908-NEXT: v_accvgpr_read_b32 v32, a7 ; Reload Reuse
955+
; GFX908-NEXT: s_nop 1
956+
; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
957+
; GFX908-NEXT: v_accvgpr_read_b32 v32, a8 ; Reload Reuse
958+
; GFX908-NEXT: s_nop 1
959+
; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
960+
; GFX908-NEXT: v_accvgpr_read_b32 v32, a9 ; Reload Reuse
961+
; GFX908-NEXT: s_nop 1
962+
; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
963+
; GFX908-NEXT: v_accvgpr_read_b32 v32, a10 ; Reload Reuse
964+
; GFX908-NEXT: s_nop 1
965+
; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
966+
; GFX908-NEXT: ;;#ASMSTART
967+
; GFX908-NEXT: ; copy
968+
; GFX908-NEXT: ;;#ASMEND
969+
; GFX908-NEXT: v_accvgpr_read_b32 v32, a1
970+
; GFX908-NEXT: s_nop 1
971+
; GFX908-NEXT: v_accvgpr_write_b32 a32, v32
972+
; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
973+
; GFX908-NEXT: s_waitcnt vmcnt(0)
974+
; GFX908-NEXT: v_accvgpr_write_b32 a0, v32 ; Reload Reuse
975+
; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
976+
; GFX908-NEXT: s_waitcnt vmcnt(0)
977+
; GFX908-NEXT: v_accvgpr_write_b32 a1, v32 ; Reload Reuse
978+
; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
979+
; GFX908-NEXT: s_waitcnt vmcnt(0)
980+
; GFX908-NEXT: v_accvgpr_write_b32 a2, v32 ; Reload Reuse
981+
; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
982+
; GFX908-NEXT: s_waitcnt vmcnt(0)
983+
; GFX908-NEXT: v_accvgpr_write_b32 a3, v32 ; Reload Reuse
984+
; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
985+
; GFX908-NEXT: s_waitcnt vmcnt(0)
986+
; GFX908-NEXT: v_accvgpr_write_b32 a4, v32 ; Reload Reuse
987+
; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
988+
; GFX908-NEXT: s_waitcnt vmcnt(0)
989+
; GFX908-NEXT: v_accvgpr_write_b32 a5, v32 ; Reload Reuse
990+
; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
991+
; GFX908-NEXT: s_waitcnt vmcnt(0)
992+
; GFX908-NEXT: v_accvgpr_write_b32 a6, v32 ; Reload Reuse
993+
; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
994+
; GFX908-NEXT: s_waitcnt vmcnt(0)
995+
; GFX908-NEXT: v_accvgpr_write_b32 a7, v32 ; Reload Reuse
996+
; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
997+
; GFX908-NEXT: s_waitcnt vmcnt(0)
998+
; GFX908-NEXT: v_accvgpr_write_b32 a8, v32 ; Reload Reuse
999+
; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
1000+
; GFX908-NEXT: s_waitcnt vmcnt(0)
1001+
; GFX908-NEXT: v_accvgpr_write_b32 a9, v32 ; Reload Reuse
1002+
; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
1003+
; GFX908-NEXT: s_waitcnt vmcnt(0)
1004+
; GFX908-NEXT: v_accvgpr_write_b32 a10, v32 ; Reload Reuse
1005+
; GFX908-NEXT: v_accvgpr_write_b32 a11, v39 ; Reload Reuse
1006+
; GFX908-NEXT: v_accvgpr_write_b32 a12, v38 ; Reload Reuse
1007+
; GFX908-NEXT: v_accvgpr_write_b32 a13, v37 ; Reload Reuse
1008+
; GFX908-NEXT: v_accvgpr_write_b32 a14, v36 ; Reload Reuse
1009+
; GFX908-NEXT: v_accvgpr_write_b32 a15, v35 ; Reload Reuse
1010+
; GFX908-NEXT: ;;#ASMSTART
1011+
; GFX908-NEXT: ; copy
1012+
; GFX908-NEXT: ;;#ASMEND
1013+
; GFX908-NEXT: v_accvgpr_read_b32 v33, a2
1014+
; GFX908-NEXT: s_nop 1
1015+
; GFX908-NEXT: v_accvgpr_write_b32 a3, v33
1016+
; GFX908-NEXT: ;;#ASMSTART
1017+
; GFX908-NEXT: ; use a3 v[0:31]
1018+
; GFX908-NEXT: ;;#ASMEND
1019+
; GFX908-NEXT: s_setpc_b64 s[30:31]
1020+
;
1021+
; GFX90A-LABEL: no_free_vgprs_at_sgpr_to_agpr_copy:
1022+
; GFX90A: ; %bb.0:
1023+
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1024+
; GFX90A-NEXT: v_mov_b32_e32 v33, v0
1025+
; GFX90A-NEXT: v_mov_b32_e32 v32, v1
1026+
; GFX90A-NEXT: ;;#ASMSTART
1027+
; GFX90A-NEXT: ; def v[0:31] s[0:15]
1028+
; GFX90A-NEXT: ;;#ASMEND
1029+
; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
1030+
; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14
1031+
; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13
1032+
; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12
1033+
; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11
1034+
; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10
1035+
; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9
1036+
; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8
1037+
; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7
1038+
; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6
1039+
; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5
1040+
; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4
1041+
; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3
1042+
; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2
1043+
; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
1044+
; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0
1045+
; GFX90A-NEXT: v_accvgpr_read_b32 v34, a32 ; Reload Reuse
1046+
; GFX90A-NEXT: s_nop 0
1047+
; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
1048+
; GFX90A-NEXT: s_nop 7
1049+
; GFX90A-NEXT: s_nop 2
1050+
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
1051+
; GFX90A-NEXT: s_waitcnt vmcnt(0)
1052+
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
1053+
; GFX90A-NEXT: buffer_store_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
1054+
; GFX90A-NEXT: buffer_store_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
1055+
; GFX90A-NEXT: buffer_store_dword a4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
1056+
; GFX90A-NEXT: buffer_store_dword a5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
1057+
; GFX90A-NEXT: buffer_store_dword a6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
1058+
; GFX90A-NEXT: buffer_store_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
1059+
; GFX90A-NEXT: buffer_store_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
1060+
; GFX90A-NEXT: buffer_store_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
1061+
; GFX90A-NEXT: buffer_store_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
1062+
; GFX90A-NEXT: v_accvgpr_read_b32 v39, a11 ; Reload Reuse
1063+
; GFX90A-NEXT: v_accvgpr_read_b32 v38, a12 ; Reload Reuse
1064+
; GFX90A-NEXT: v_accvgpr_read_b32 v37, a13 ; Reload Reuse
1065+
; GFX90A-NEXT: v_accvgpr_read_b32 v36, a14 ; Reload Reuse
1066+
; GFX90A-NEXT: v_accvgpr_read_b32 v35, a15 ; Reload Reuse
1067+
; GFX90A-NEXT: ;;#ASMSTART
1068+
; GFX90A-NEXT: ; copy
1069+
; GFX90A-NEXT: ;;#ASMEND
1070+
; GFX90A-NEXT: v_accvgpr_mov_b32 a32, a1
1071+
; GFX90A-NEXT: buffer_load_dword a0, off, s[0:3], s32 ; 4-byte Folded Reload
1072+
; GFX90A-NEXT: s_nop 0
1073+
; GFX90A-NEXT: buffer_load_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
1074+
; GFX90A-NEXT: s_nop 0
1075+
; GFX90A-NEXT: buffer_load_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
1076+
; GFX90A-NEXT: s_nop 0
1077+
; GFX90A-NEXT: buffer_load_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
1078+
; GFX90A-NEXT: s_nop 0
1079+
; GFX90A-NEXT: buffer_load_dword a4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
1080+
; GFX90A-NEXT: s_nop 0
1081+
; GFX90A-NEXT: buffer_load_dword a5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1082+
; GFX90A-NEXT: s_nop 0
1083+
; GFX90A-NEXT: buffer_load_dword a6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1084+
; GFX90A-NEXT: s_nop 0
1085+
; GFX90A-NEXT: buffer_load_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1086+
; GFX90A-NEXT: s_nop 0
1087+
; GFX90A-NEXT: buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1088+
; GFX90A-NEXT: s_nop 0
1089+
; GFX90A-NEXT: buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
1090+
; GFX90A-NEXT: s_nop 0
1091+
; GFX90A-NEXT: buffer_load_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
1092+
; GFX90A-NEXT: s_waitcnt vmcnt(0)
1093+
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v39 ; Reload Reuse
1094+
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v38 ; Reload Reuse
1095+
; GFX90A-NEXT: v_accvgpr_write_b32 a13, v37 ; Reload Reuse
1096+
; GFX90A-NEXT: v_accvgpr_write_b32 a14, v36 ; Reload Reuse
1097+
; GFX90A-NEXT: v_accvgpr_write_b32 a15, v35 ; Reload Reuse
1098+
; GFX90A-NEXT: ;;#ASMSTART
1099+
; GFX90A-NEXT: ; copy
1100+
; GFX90A-NEXT: ;;#ASMEND
1101+
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a2
1102+
; GFX90A-NEXT: ;;#ASMSTART
1103+
; GFX90A-NEXT: ; use a3 v[0:31]
1104+
; GFX90A-NEXT: ;;#ASMEND
1105+
; GFX90A-NEXT: v_accvgpr_write_b32 a32, v34 ; Reload Reuse
1106+
; GFX90A-NEXT: s_setpc_b64 s[30:31]
1107+
%asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1","=${v[0:31]},=${s[0:15]}"()
1108+
%vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0
1109+
%agpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 1
1110+
%mfma = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %agpr0, i32 0, i32 0, i32 0)
1111+
%agpr1 = call i32 asm sideeffect "; copy ", "={a1},a,~{a[0:15]}"(<16 x float> %agpr0)
1112+
%agpr2 = call i32 asm sideeffect "; copy ", "={a2},a,{a[0:15]}"(i32 %agpr1, <16 x float> %mfma)
1113+
call void asm sideeffect "; use $0 $1","{a3},{v[0:31]}"(i32 %agpr2, <32 x i32> %vgpr0)
1114+
ret void
1115+
}
1116+
8671117
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #1
8681118
declare i32 @llvm.amdgcn.workitem.id.x() #2
8691119

0 commit comments

Comments
 (0)