Skip to content

Commit 61c6e00

Browse files
authored
[AMDGPU][True16][CodeGen] flat/global/scratch load/store pseudo for true16 (#127945)
The T16D16 table was implemented in #127673. This is a follow-up patch that adds load/store pseudos for flat_store, global_load/global_store, and scratch_load/scratch_store in true16 mode, and updates the codegen test file.
1 parent 2b06cee commit 61c6e00

File tree

11 files changed

+1029
-465
lines changed

11 files changed

+1029
-465
lines changed

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 222 additions & 32 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 151 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,21 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
6060
; GFX10-NEXT: global_store_short v[2:3], v0, off
6161
; GFX10-NEXT: s_setpc_b64 s[30:31]
6262
;
63-
; GFX11-LABEL: test_load_store:
64-
; GFX11: ; %bb.0:
65-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
66-
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
67-
; GFX11-NEXT: s_waitcnt vmcnt(0)
68-
; GFX11-NEXT: global_store_b16 v[2:3], v0, off
69-
; GFX11-NEXT: s_setpc_b64 s[30:31]
63+
; GFX11TRUE16-LABEL: test_load_store:
64+
; GFX11TRUE16: ; %bb.0:
65+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
66+
; GFX11TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
67+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
68+
; GFX11TRUE16-NEXT: global_store_b16 v[2:3], v0, off
69+
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
70+
;
71+
; GFX11FAKE16-LABEL: test_load_store:
72+
; GFX11FAKE16: ; %bb.0:
73+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
74+
; GFX11FAKE16-NEXT: global_load_u16 v0, v[0:1], off
75+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
76+
; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v0, off
77+
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
7078
%val = load bfloat, ptr addrspace(1) %in
7179
store bfloat %val, ptr addrspace(1) %out
7280
ret void
@@ -2127,14 +2135,23 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
21272135
; GFX10-NEXT: global_store_short v[2:3], v5, off
21282136
; GFX10-NEXT: s_setpc_b64 s[30:31]
21292137
;
2130-
; GFX11-LABEL: test_store_fpimm:
2131-
; GFX11: ; %bb.0:
2132-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2133-
; GFX11-NEXT: v_mov_b32_e32 v4, 0x3f80
2134-
; GFX11-NEXT: v_mov_b32_e32 v5, 0x4228
2135-
; GFX11-NEXT: global_store_b16 v[0:1], v4, off
2136-
; GFX11-NEXT: global_store_b16 v[2:3], v5, off
2137-
; GFX11-NEXT: s_setpc_b64 s[30:31]
2138+
; GFX11TRUE16-LABEL: test_store_fpimm:
2139+
; GFX11TRUE16: ; %bb.0:
2140+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2141+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, 0x3f80
2142+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, 0x4228
2143+
; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v4, off
2144+
; GFX11TRUE16-NEXT: global_store_d16_hi_b16 v[2:3], v4, off
2145+
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
2146+
;
2147+
; GFX11FAKE16-LABEL: test_store_fpimm:
2148+
; GFX11FAKE16: ; %bb.0:
2149+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2150+
; GFX11FAKE16-NEXT: v_mov_b32_e32 v4, 0x3f80
2151+
; GFX11FAKE16-NEXT: v_mov_b32_e32 v5, 0x4228
2152+
; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v4, off
2153+
; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v5, off
2154+
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
21382155
store bfloat 1.0, ptr addrspace(1) %ptr0
21392156
store bfloat 42.0, ptr addrspace(1) %ptr1
21402157
ret void
@@ -3330,12 +3347,19 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1)
33303347
; GFX10-NEXT: global_store_short v[0:1], v2, off
33313348
; GFX10-NEXT: s_setpc_b64 s[30:31]
33323349
;
3333-
; GFX11-LABEL: test_inreg_arg_store:
3334-
; GFX11: ; %bb.0:
3335-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3336-
; GFX11-NEXT: v_mov_b32_e32 v2, s4
3337-
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
3338-
; GFX11-NEXT: s_setpc_b64 s[30:31]
3350+
; GFX11TRUE16-LABEL: test_inreg_arg_store:
3351+
; GFX11TRUE16: ; %bb.0:
3352+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3353+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s4
3354+
; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v2, off
3355+
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
3356+
;
3357+
; GFX11FAKE16-LABEL: test_inreg_arg_store:
3358+
; GFX11FAKE16: ; %bb.0:
3359+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3360+
; GFX11FAKE16-NEXT: v_mov_b32_e32 v2, s4
3361+
; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v2, off
3362+
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
33393363
store bfloat %in, ptr addrspace(1) %out
33403364
ret void
33413365
}
@@ -3379,11 +3403,18 @@ define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) {
33793403
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32
33803404
; GFX10-NEXT: s_setpc_b64 s[30:31]
33813405
;
3382-
; GFX11-LABEL: test_byval:
3383-
; GFX11: ; %bb.0:
3384-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3385-
; GFX11-NEXT: scratch_store_b16 off, v0, s32
3386-
; GFX11-NEXT: s_setpc_b64 s[30:31]
3406+
; GFX11TRUE16-LABEL: test_byval:
3407+
; GFX11TRUE16: ; %bb.0:
3408+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3409+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
3410+
; GFX11TRUE16-NEXT: scratch_store_b16 off, v1, s32
3411+
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
3412+
;
3413+
; GFX11FAKE16-LABEL: test_byval:
3414+
; GFX11FAKE16: ; %bb.0:
3415+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3416+
; GFX11FAKE16-NEXT: scratch_store_b16 off, v0, s32
3417+
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
33873418
store bfloat %val, ptr addrspace(5) %bv
33883419
%retval = load bfloat, ptr addrspace(5) %bv
33893420
ret bfloat %retval
@@ -3490,13 +3521,21 @@ define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %ou
34903521
; GFX10-NEXT: global_store_short v[2:3], v0, off
34913522
; GFX10-NEXT: s_setpc_b64 s[30:31]
34923523
;
3493-
; GFX11-LABEL: test_bitcast_from_bfloat:
3494-
; GFX11: ; %bb.0:
3495-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3496-
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
3497-
; GFX11-NEXT: s_waitcnt vmcnt(0)
3498-
; GFX11-NEXT: global_store_b16 v[2:3], v0, off
3499-
; GFX11-NEXT: s_setpc_b64 s[30:31]
3524+
; GFX11TRUE16-LABEL: test_bitcast_from_bfloat:
3525+
; GFX11TRUE16: ; %bb.0:
3526+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3527+
; GFX11TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
3528+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
3529+
; GFX11TRUE16-NEXT: global_store_b16 v[2:3], v0, off
3530+
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
3531+
;
3532+
; GFX11FAKE16-LABEL: test_bitcast_from_bfloat:
3533+
; GFX11FAKE16: ; %bb.0:
3534+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3535+
; GFX11FAKE16-NEXT: global_load_u16 v0, v[0:1], off
3536+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
3537+
; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v0, off
3538+
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
35003539
%val = load bfloat, ptr addrspace(1) %in
35013540
%val_int = bitcast bfloat %val to i16
35023541
store i16 %val_int, ptr addrspace(1) %out
@@ -3556,13 +3595,21 @@ define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in)
35563595
; GFX10-NEXT: global_store_short v[0:1], v2, off
35573596
; GFX10-NEXT: s_setpc_b64 s[30:31]
35583597
;
3559-
; GFX11-LABEL: test_bitcast_to_bfloat:
3560-
; GFX11: ; %bb.0:
3561-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3562-
; GFX11-NEXT: global_load_u16 v2, v[2:3], off
3563-
; GFX11-NEXT: s_waitcnt vmcnt(0)
3564-
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
3565-
; GFX11-NEXT: s_setpc_b64 s[30:31]
3598+
; GFX11TRUE16-LABEL: test_bitcast_to_bfloat:
3599+
; GFX11TRUE16: ; %bb.0:
3600+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3601+
; GFX11TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off
3602+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
3603+
; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v2, off
3604+
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
3605+
;
3606+
; GFX11FAKE16-LABEL: test_bitcast_to_bfloat:
3607+
; GFX11FAKE16: ; %bb.0:
3608+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3609+
; GFX11FAKE16-NEXT: global_load_u16 v2, v[2:3], off
3610+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
3611+
; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v2, off
3612+
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
35663613
%val = load i16, ptr addrspace(1) %in
35673614
%val_fp = bitcast i16 %val to bfloat
35683615
store bfloat %val_fp, ptr addrspace(1) %out
@@ -5309,14 +5356,23 @@ define bfloat @test_alloca_load_store_ret(bfloat %in) {
53095356
; GFX10-NEXT: s_waitcnt vmcnt(0)
53105357
; GFX10-NEXT: s_setpc_b64 s[30:31]
53115358
;
5312-
; GFX11-LABEL: test_alloca_load_store_ret:
5313-
; GFX11: ; %bb.0: ; %entry
5314-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5315-
; GFX11-NEXT: scratch_store_b16 off, v0, s32 dlc
5316-
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
5317-
; GFX11-NEXT: scratch_load_u16 v0, off, s32 glc dlc
5318-
; GFX11-NEXT: s_waitcnt vmcnt(0)
5319-
; GFX11-NEXT: s_setpc_b64 s[30:31]
5359+
; GFX11TRUE16-LABEL: test_alloca_load_store_ret:
5360+
; GFX11TRUE16: ; %bb.0: ; %entry
5361+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5362+
; GFX11TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
5363+
; GFX11TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
5364+
; GFX11TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
5365+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
5366+
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
5367+
;
5368+
; GFX11FAKE16-LABEL: test_alloca_load_store_ret:
5369+
; GFX11FAKE16: ; %bb.0: ; %entry
5370+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5371+
; GFX11FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
5372+
; GFX11FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
5373+
; GFX11FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
5374+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
5375+
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
53205376
entry:
53215377
%in.addr = alloca bfloat, align 2, addrspace(5)
53225378
store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2
@@ -5667,26 +5723,48 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
56675723
; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128
56685724
; GFX10-NEXT: s_setpc_b64 s[30:31]
56695725
;
5670-
; GFX11-LABEL: test_overflow_stack:
5671-
; GFX11: ; %bb.0:
5672-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5673-
; GFX11-NEXT: s_clause 0x2
5674-
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
5675-
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
5676-
; GFX11-NEXT: scratch_load_b32 v31, off, s32
5677-
; GFX11-NEXT: s_clause 0x5
5678-
; GFX11-NEXT: scratch_store_b128 v0, v[22:25], off offset:80
5679-
; GFX11-NEXT: scratch_store_b128 v0, v[18:21], off offset:64
5680-
; GFX11-NEXT: scratch_store_b128 v0, v[14:17], off offset:48
5681-
; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:32
5682-
; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:16
5683-
; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off
5684-
; GFX11-NEXT: s_waitcnt vmcnt(0)
5685-
; GFX11-NEXT: s_clause 0x2
5686-
; GFX11-NEXT: scratch_store_b128 v0, v[30:33], off offset:112
5687-
; GFX11-NEXT: scratch_store_b128 v0, v[26:29], off offset:96
5688-
; GFX11-NEXT: scratch_store_b16 v0, v1, off offset:128
5689-
; GFX11-NEXT: s_setpc_b64 s[30:31]
5726+
; GFX11TRUE16-LABEL: test_overflow_stack:
5727+
; GFX11TRUE16: ; %bb.0:
5728+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5729+
; GFX11TRUE16-NEXT: s_clause 0x2
5730+
; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
5731+
; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
5732+
; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32
5733+
; GFX11TRUE16-NEXT: s_clause 0x3
5734+
; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[22:25], off offset:80
5735+
; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[18:21], off offset:64
5736+
; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[14:17], off offset:48
5737+
; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[10:13], off offset:32
5738+
; GFX11TRUE16-NEXT: s_clause 0x1
5739+
; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[6:9], off offset:16
5740+
; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[2:5], off
5741+
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
5742+
; GFX11TRUE16-NEXT: s_clause 0x2
5743+
; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[30:33], off offset:112
5744+
; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[26:29], off offset:96
5745+
; GFX11TRUE16-NEXT: scratch_store_b16 v0, v1, off offset:128
5746+
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
5747+
;
5748+
; GFX11FAKE16-LABEL: test_overflow_stack:
5749+
; GFX11FAKE16: ; %bb.0:
5750+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5751+
; GFX11FAKE16-NEXT: s_clause 0x2
5752+
; GFX11FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
5753+
; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
5754+
; GFX11FAKE16-NEXT: scratch_load_b32 v31, off, s32
5755+
; GFX11FAKE16-NEXT: s_clause 0x5
5756+
; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[22:25], off offset:80
5757+
; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[18:21], off offset:64
5758+
; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[14:17], off offset:48
5759+
; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[10:13], off offset:32
5760+
; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[6:9], off offset:16
5761+
; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[2:5], off
5762+
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
5763+
; GFX11FAKE16-NEXT: s_clause 0x2
5764+
; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[30:33], off offset:112
5765+
; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[26:29], off offset:96
5766+
; GFX11FAKE16-NEXT: scratch_store_b16 v0, v1, off offset:128
5767+
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
56905768
%ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0
56915769
%ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1
56925770
ret { <32 x i32>, bfloat } %ins.1
@@ -42719,7 +42797,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4271942797
; GFX11TRUE16: ; %bb.0:
4272042798
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4272142799
; GFX11TRUE16-NEXT: s_clause 0x1f
42722-
; GFX11TRUE16-NEXT: scratch_load_u16 v31, off, s32
42800+
; GFX11TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32
4272342801
; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:64
4272442802
; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:128
4272542803
; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:60
@@ -42752,16 +42830,17 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4275242830
; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:72
4275342831
; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:4
4275442832
; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:68
42833+
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
4275542834
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
4275642835
; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
42757-
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
4275842836
; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18
4275942837
; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20
4276042838
; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22
4276142839
; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24
4276242840
; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26
4276342841
; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28
4276442842
; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30
42843+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v16
4276542844
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
4276642845
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
4276742846
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
@@ -42785,7 +42864,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4278542864
; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29
4278642865
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
4278742866
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v14
42788-
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v16
4278942867
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v18
4279042868
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v20
4279142869
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v22
@@ -42815,14 +42893,15 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4281542893
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v27
4281642894
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v29
4281742895
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32)
42818-
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v31
42896+
; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.l
4281942897
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(31)
4282042898
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v32
4282142899
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30)
4282242900
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v33
4282342901
; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v32.l, s28
4282442902
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(29)
4282542903
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v34
42904+
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
4282642905
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28)
4282742906
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v35
4282842907
; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v35.l, v34.l, s27

0 commit comments

Comments
 (0)