Skip to content

Commit 2a9d431

Browse files
committed
[AMDGPU/VOP3P] - Add tied wmma_f16 pseudos
Add tied variants of the `wmma_f16` and `wmma_bf16` intrinsics, which always stay two-address instructions (they are never converted to the three-address form). This is a requirement for a future optimization regarding wmma instructions. The new changes make use of the `op_sel` argument of `wmma` instructions to read from the upper halves of the input accumulator and write to the upper halves of the output matrix. With two-address instructions, the destination is tied to the input accumulator, so we can guarantee that the content of the upper halves is the same as in the input accumulator. With three-address instructions, the output registers do not copy the content of the input registers. Instead, the upper halves remain unchanged from their previous values. This can cause issues if there are unexpected values remaining in these registers. For example: ``` v_wmma_f16_16x16x16_f16 v[0:7], ..., v[24:31] v_wmma_f16_16x16x16_f16 v[32:39], ..., v[24:31] ``` After these two instructions run, there is no guarantee that the content of bits 16-31 of `v[0:7]` is the same as that of `v[24:31]`. If we have another instruction like the following: ``` v_wmma_f16_16x16x16_f16 v[0:7], v[24:31], v[32:39], v[0:7] op_sel:[0,0,1] ``` We read from the upper halves of `v[0:7]`, but the content is not necessarily correct. For our purpose, we create new pseudo instructions, while maintaining the behavior of the original instructions.
1 parent edebbb4 commit 2a9d431

File tree

7 files changed

+404
-18
lines changed

7 files changed

+404
-18
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2361,8 +2361,16 @@ class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
23612361

23622362
def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llvm_anyfloat_ty>;
23632363
def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
2364+
// The regular, untied f16/bf16 wmma intrinsics only write to one half
2365+
// of the registers (set via the op_sel bit).
2366+
// The content of the other 16 bits of each register is undefined.
23642367
def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
23652368
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
2369+
// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix
2370+
// registers to the input accumulator registers.
2371+
// Essentially, the content of the other 16 bits of each register is preserved from the input.
2372+
def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
2373+
def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
23662374
def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
23672375
def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;
23682376

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4279,6 +4279,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
42794279
case Intrinsic::amdgcn_sudot8:
42804280
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
42814281
case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4282+
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
4283+
case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
42824284
case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
42834285
case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
42844286
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -865,22 +865,26 @@ def WMMAOpcode3AddrMappingTable : WMMAMappingTable {
865865
// it converts the default pseudo to the pseudo where src2 is not the same as vdst.
866866
// 3) @earlyclobber on the destination satisfies the constraint during RA.
867867

868-
multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator node = null_frag, RegisterOperand _Src01RC64 = VRegSrc_256, WMMAType Type> {
868+
multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator node = null_frag, RegisterOperand _Src01RC64 = VRegSrc_256, WMMAType Type, bit convertibleTo3Addr> {
869869

870870
defvar WMMAConstraints2Addr = "@earlyclobber $vdst,$vdst = $src2";
871871
defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
872872

873873
defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>;
874874
let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
875-
let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in {
875+
let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in {
876876
def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
877877
}
878-
let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
879-
def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
880-
}
881878
}
882-
def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr # Suffix),
879+
if convertibleTo3Addr then {
880+
let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
881+
let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
882+
def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
883+
}
884+
}
885+
def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr # Suffix),
883886
!cast<Instruction>(NAME # _threeaddr # Suffix)>;
887+
}
884888

885889
if !eq(Type, WMMAOpSel) then {
886890
def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
@@ -893,21 +897,25 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator
893897

894898

895899
let WaveSizePredicate = isWave32 in {
896-
defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
897-
defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
898-
defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
899-
defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
900-
defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
901-
defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
900+
defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>;
901+
defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>;
902+
defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 1>;
903+
defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 1>;
904+
defm V_WMMA_F16_16X16X16_F16_TIED : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16_tied, VRegSrc_256, WMMAOpSel, 0>;
905+
defm V_WMMA_BF16_16X16X16_BF16_TIED : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16_tied, VRegSrc_256, WMMAOpSel, 0>;
906+
defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp, 1>;
907+
defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp, 1>;
902908
}
903909

904910
let WaveSizePredicate = isWave64 in {
905-
defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
906-
defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
907-
defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
908-
defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
909-
defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
910-
defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
911+
defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>;
912+
defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>;
913+
defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 1>;
914+
defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 1>;
915+
defm V_WMMA_F16_16X16X16_F16_TIED : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16_tied, VRegSrc_256, WMMAOpSel, 0>;
916+
defm V_WMMA_BF16_16X16X16_BF16_TIED : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16_tied, VRegSrc_256, WMMAOpSel, 0>;
917+
defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp, 1>;
918+
defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp, 1>;
911919

912920
}
913921

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
55
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
66
declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
7+
declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
78
declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
9+
declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
810
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
911
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
1012

@@ -78,6 +80,55 @@ bb:
7880
ret void
7981
}
8082

83+
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
84+
; W32-LABEL: test_wmma_f16_16x16x16_f16_untied:
85+
; W32: ; %bb.0: ; %bb
86+
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[32:39]
87+
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
88+
; W32-NEXT: s_clause 0x1
89+
; W32-NEXT: global_store_b128 v[40:41], v[44:47], off
90+
; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16
91+
; W32-NEXT: s_clause 0x1
92+
; W32-NEXT: global_store_b128 v[42:43], v[32:35], off
93+
; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16
94+
; W32-NEXT: s_nop 0
95+
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
96+
; W32-NEXT: s_endpgm
97+
bb:
98+
%res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
99+
%res.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
100+
store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
101+
store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
102+
ret void
103+
}
104+
105+
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
106+
; W32-LABEL: test_wmma_f16_16x16x16_f16_tied:
107+
; W32: ; %bb.0: ; %bb
108+
; W32-NEXT: v_dual_mov_b32 v51, v39 :: v_dual_mov_b32 v50, v38
109+
; W32-NEXT: v_dual_mov_b32 v49, v37 :: v_dual_mov_b32 v48, v36
110+
; W32-NEXT: v_dual_mov_b32 v47, v35 :: v_dual_mov_b32 v46, v34
111+
; W32-NEXT: v_dual_mov_b32 v45, v33 :: v_dual_mov_b32 v44, v32
112+
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
113+
; W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
114+
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[44:51]
115+
; W32-NEXT: s_clause 0x1
116+
; W32-NEXT: global_store_b128 v[40:41], v[44:47], off
117+
; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16
118+
; W32-NEXT: s_clause 0x1
119+
; W32-NEXT: global_store_b128 v[42:43], v[32:35], off
120+
; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16
121+
; W32-NEXT: s_nop 0
122+
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
123+
; W32-NEXT: s_endpgm
124+
bb:
125+
%res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
126+
%res.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
127+
store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
128+
store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
129+
ret void
130+
}
131+
81132
; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
82133

83134
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) {
@@ -112,6 +163,55 @@ bb:
112163
ret void
113164
}
114165

166+
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
167+
; W32-LABEL: test_wmma_bf16_16x16x16_bf16_untied:
168+
; W32: ; %bb.0: ; %bb
169+
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[32:39]
170+
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
171+
; W32-NEXT: s_clause 0x1
172+
; W32-NEXT: global_store_b128 v[40:41], v[44:47], off
173+
; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16
174+
; W32-NEXT: s_clause 0x1
175+
; W32-NEXT: global_store_b128 v[42:43], v[32:35], off
176+
; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16
177+
; W32-NEXT: s_nop 0
178+
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
179+
; W32-NEXT: s_endpgm
180+
bb:
181+
%res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
182+
%res.1 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
183+
store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32
184+
store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32
185+
ret void
186+
}
187+
188+
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
189+
; W32-LABEL: test_wmma_bf16_16x16x16_bf16_tied:
190+
; W32: ; %bb.0: ; %bb
191+
; W32-NEXT: v_dual_mov_b32 v51, v39 :: v_dual_mov_b32 v50, v38
192+
; W32-NEXT: v_dual_mov_b32 v49, v37 :: v_dual_mov_b32 v48, v36
193+
; W32-NEXT: v_dual_mov_b32 v47, v35 :: v_dual_mov_b32 v46, v34
194+
; W32-NEXT: v_dual_mov_b32 v45, v33 :: v_dual_mov_b32 v44, v32
195+
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
196+
; W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
197+
; W32-NEXT: v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[44:51]
198+
; W32-NEXT: s_clause 0x1
199+
; W32-NEXT: global_store_b128 v[40:41], v[44:47], off
200+
; W32-NEXT: global_store_b128 v[40:41], v[48:51], off offset:16
201+
; W32-NEXT: s_clause 0x1
202+
; W32-NEXT: global_store_b128 v[42:43], v[32:35], off
203+
; W32-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16
204+
; W32-NEXT: s_nop 0
205+
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
206+
; W32-NEXT: s_endpgm
207+
bb:
208+
%res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
209+
%res.1 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
210+
store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32
211+
store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32
212+
ret void
213+
}
214+
115215
; @llvm.amdgcn.wmma.i32.16x16x16.iu8
116216

117217
define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {

0 commit comments

Comments
 (0)