; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s

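; Both RUN lines should produce the same pipelined schedule: the first uses
; the default IGroupLP solver, the second sets the exact solver's branch
; cutoff (-amdgpu-igrouplp-exact-solver-max-branches=250000). The two check
; prefixes currently match instruction-for-instruction.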
declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
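; Operand summary (inferred from the checks below): A (<8 x half>),
; B (<16 x half>), the C accumulator (<8 x half>), and an i16 sparsity index.
; In the generated v_swmmac_f16_16x16x32_f16 the accumulator is tied to the
; destination registers, which is why each SWMMAC is preceded by v_mov copies
; of C and why the index register (v48/v18) holds zero.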

define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v28, 4, v0
; GCN-NEXT:    v_mov_b32_e32 v48, 0
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT:    v_add_nc_u32_e32 v0, s0, v28
; GCN-NEXT:    v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
; GCN-NEXT:    ds_load_b128 v[8:11], v0
; GCN-NEXT:    ds_load_b128 v[12:15], v0 offset:512
; GCN-NEXT:    ds_load_b128 v[16:19], v0 offset:1536
; GCN-NEXT:    ds_load_b128 v[20:23], v0 offset:3072
; GCN-NEXT:    ds_load_b128 v[24:27], v0 offset:5120
; GCN-NEXT:    ds_load_b128 v[4:7], v0 offset:11280
; GCN-NEXT:    ds_load_b128 v[0:3], v0 offset:11264
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x6
; GCN-NEXT:    v_mov_b32_e32 v31, v11
; GCN-NEXT:    s_wait_dscnt 0x5
; GCN-NEXT:    v_mov_b32_e32 v35, v15
; GCN-NEXT:    s_wait_dscnt 0x4
; GCN-NEXT:    v_mov_b32_e32 v39, v19
; GCN-NEXT:    s_wait_dscnt 0x3
; GCN-NEXT:    v_mov_b32_e32 v43, v23
; GCN-NEXT:    s_wait_dscnt 0x2
; GCN-NEXT:    v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
; GCN-NEXT:    v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
; GCN-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
; GCN-NEXT:    v_mov_b32_e32 v32, v12
; GCN-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
; GCN-NEXT:    v_mov_b32_e32 v36, v16
; GCN-NEXT:    v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
; GCN-NEXT:    v_mov_b32_e32 v40, v20
; GCN-NEXT:    v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
; GCN-NEXT:    v_mov_b32_e32 v44, v24
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48
; GCN-NEXT:    ds_store_b128 v49, v[28:31]
; GCN-NEXT:    ds_store_b128 v50, v[32:35] offset:512
; GCN-NEXT:    ds_store_b128 v50, v[36:39] offset:1024
; GCN-NEXT:    ds_store_b128 v50, v[40:43] offset:1536
; GCN-NEXT:    ds_store_b128 v50, v[44:47] offset:2048
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(5) SyncID(0)
; GCN-NEXT:    s_endpgm
;
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
; EXACTCUTOFF:       ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v28, 4, v0
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v48, 0
; EXACTCUTOFF-NEXT:    s_wait_kmcnt 0x0
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v0, s0, v28
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v0
; EXACTCUTOFF-NEXT:    ds_load_b128 v[12:15], v0 offset:512
; EXACTCUTOFF-NEXT:    ds_load_b128 v[16:19], v0 offset:1536
; EXACTCUTOFF-NEXT:    ds_load_b128 v[20:23], v0 offset:3072
; EXACTCUTOFF-NEXT:    ds_load_b128 v[24:27], v0 offset:5120
; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v0 offset:11280
; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v0 offset:11264
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x6
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v31, v11
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x5
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v35, v15
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x4
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v39, v19
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x3
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v43, v23
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x2
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v32, v12
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v36, v16
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v40, v20
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v44, v24
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48
; EXACTCUTOFF-NEXT:    ds_store_b128 v49, v[28:31]
; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[32:35] offset:512
; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[36:39] offset:1024
; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[40:43] offset:1536
; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[44:47] offset:2048
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(5) SyncID(0)
; EXACTCUTOFF-NEXT:    s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %in, i32 %idx
  %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr
  %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 32
  %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr
  %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 64
  %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr
  %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 96
  %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr
  %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 128
  %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr
  %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %load.4.addr, i32 192
  %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr
  %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i16 0)
  %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i16 0)
  %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i16 0)
  %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i16 0)
  %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i16 0)
  %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx
  store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr
  %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32
  store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr
  %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64
  store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr
  %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96
  store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr
  %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128
  store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr
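  ; The masks below use the AMDGPU sched_group_barrier group encoding:
  ; 0x8 = MFMA/WMMA (covers SWMMAC), 0x100 = DS read, 0x200 = DS write.
  ; The read group is size(7) because each <8 x half> tile is a single
  ; ds_load_b128 and the <16 x half> B operand lowers to two more.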
  ; 7 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 7, i32 0)
  ; 5 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 5, i32 0)
  ; 5 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 5, i32 0)
  ret void
}

define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GCN-NEXT:    v_mov_b32_e32 v18, 0
; GCN-NEXT:    s_wait_kmcnt 0x0
; GCN-NEXT:    v_lshl_add_u32 v17, v0, 5, s0
; GCN-NEXT:    v_lshl_add_u32 v0, v0, 4, s1
; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:1024
; GCN-NEXT:    ds_load_b128 v[1:4], v17
; GCN-NEXT:    ds_load_b128 v[5:8], v17 offset:16
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(3) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x2
; GCN-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; GCN-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v0, v[13:16]
; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:2560
; GCN-NEXT:    v_mov_b32_e32 v0, s1
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; GCN-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v0, v[13:16] offset:512
; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:4608
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; GCN-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v0, v[13:16] offset:1024
; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:7168
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; GCN-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v0, v[13:16] offset:1536
; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:10240
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; GCN-NEXT:    s_wait_dscnt 0x0
; GCN-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; GCN-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-NEXT:    ds_store_b128 v0, v[13:16] offset:2048
; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; GCN-NEXT:    s_endpgm
;
; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
; EXACTCUTOFF:       ; %bb.0: ; %entry
; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v18, 0
; EXACTCUTOFF-NEXT:    s_wait_kmcnt 0x0
; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v17, v0, 5, s0
; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v0, v0, 4, s1
; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:1024
; EXACTCUTOFF-NEXT:    ds_load_b128 v[1:4], v17
; EXACTCUTOFF-NEXT:    ds_load_b128 v[5:8], v17 offset:16
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(3) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x2
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v0, v[13:16]
; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:2560
; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s1
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v0, v[13:16] offset:512
; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:4608
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v0, v[13:16] offset:1024
; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:7168
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v0, v[13:16] offset:1536
; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:10240
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    ds_store_b128 v0, v[13:16] offset:2048
; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
; EXACTCUTOFF-NEXT:    s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx
  %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr
  %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %load.b.addr, i32 64
  %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr
  %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 96
  %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr
  %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 128
  %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr
  %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 160
  %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr
  %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 192
  %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr
  %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i16 0)
  %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i16 0)
  %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i16 0)
  %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i16 0)
  %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i16 0)
  %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx
  store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr
  %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32
  store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr
  %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64
  store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr
  %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96
  store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr
  %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128
  store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr
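  ; Same group encoding as above, arranged as a software pipeline: one group
  ; of three DS reads (the two b128 halves of B plus the first A tile), then
  ; repeated SWMMAC / DS write / DS read rounds that overlap each store with
  ; the next tile's load, closing with the final SWMMAC and store.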
  ; 3 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 3, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ; 1 DS read
  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
  ; 1 SWMMAC
  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  ; 1 DS write
  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
  ret void
}

; Declarations for the remaining intrinsics used above; the body of attribute
; group #0 is a minimal placeholder.
declare i32 @llvm.amdgcn.workitem.id.x()
declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32)

attributes #0 = { nounwind }