|
| 1 | +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| 2 | +; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize32 -misched=gcn-max-occupancy -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK |
| 3 | + |
| 4 | +; Provide a long sequence of 32 vec4 load/store pairs that ought to be fully |
| 5 | +; overlapped for latency hiding. Doing so requires using at least 128 VGPRs |
| 6 | +; (32 x 4), which the scheduler currently treats as an occupancy reduction even |
| 7 | +; though it is not (presumably LDS usage already limits occupancy). TODO: Fix! |
| 8 | + |
| 9 | +; 6 kB of LDS (384 x 16 bytes), allows 10 workgroups |
| 10 | +@lds = internal addrspace(3) global [384 x <4 x i32>] undef |
| 11 | + |
| 12 | +define internal void @copy(ptr addrspace(1) %src, i32 %ofs) alwaysinline { ; copy the %ofs-th <4 x i32> from %src into @lds; alwaysinline so callers get the body inlined |
| 13 | + %src.gep = getelementptr <4 x i32>, ptr addrspace(1) %src, i32 %ofs ; source address: %src indexed by %ofs in <4 x i32> units |
| 14 | + %ld = load <4 x i32>, ptr addrspace(1) %src.gep ; addrspace(1) load (global_load_b128 in the CHECK lines of @test) |
| 15 | + %dst.gep = getelementptr <4 x i32>, ptr addrspace(3) @lds, i32 %ofs ; destination address: same index %ofs into @lds |
| 16 | + store <4 x i32> %ld, ptr addrspace(3) %dst.gep ; addrspace(3) store (ds_store_b128 in the CHECK lines of @test) |
| 17 | + ret void |
| 18 | +} |
| 19 | + |
| 20 | +define amdgpu_cs void @test(ptr addrspace(1) %src) "amdgpu-flat-work-group-size"="32,32" { ; NOTE(review): "32,32" presumably fixes the workgroup at exactly 32 work-items - confirm against the AMDGPU usage docs |
| 21 | +; CHECK-LABEL: test: |
| 22 | +; CHECK: ; %bb.0: |
| 23 | +; CHECK-NEXT: s_clause 0xa |
| 24 | +; CHECK-NEXT: global_load_b128 v[2:5], v[0:1], off |
| 25 | +; CHECK-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16 |
| 26 | +; CHECK-NEXT: global_load_b128 v[10:13], v[0:1], off offset:32 |
| 27 | +; CHECK-NEXT: global_load_b128 v[14:17], v[0:1], off offset:48 |
| 28 | +; CHECK-NEXT: global_load_b128 v[18:21], v[0:1], off offset:64 |
| 29 | +; CHECK-NEXT: global_load_b128 v[22:25], v[0:1], off offset:80 |
| 30 | +; CHECK-NEXT: global_load_b128 v[26:29], v[0:1], off offset:96 |
| 31 | +; CHECK-NEXT: global_load_b128 v[30:33], v[0:1], off offset:112 |
| 32 | +; CHECK-NEXT: global_load_b128 v[34:37], v[0:1], off offset:128 |
| 33 | +; CHECK-NEXT: global_load_b128 v[38:41], v[0:1], off offset:144 |
| 34 | +; CHECK-NEXT: global_load_b128 v[42:45], v[0:1], off offset:160 |
| 35 | +; CHECK-NEXT: v_mov_b32_e32 v86, 0 |
| 36 | +; CHECK-NEXT: s_clause 0x8 |
| 37 | +; CHECK-NEXT: global_load_b128 v[46:49], v[0:1], off offset:176 |
| 38 | +; CHECK-NEXT: global_load_b128 v[50:53], v[0:1], off offset:240 |
| 39 | +; CHECK-NEXT: global_load_b128 v[54:57], v[0:1], off offset:224 |
| 40 | +; CHECK-NEXT: global_load_b128 v[58:61], v[0:1], off offset:208 |
| 41 | +; CHECK-NEXT: global_load_b128 v[62:65], v[0:1], off offset:192 |
| 42 | +; CHECK-NEXT: global_load_b128 v[66:69], v[0:1], off offset:304 |
| 43 | +; CHECK-NEXT: global_load_b128 v[70:73], v[0:1], off offset:288 |
| 44 | +; CHECK-NEXT: global_load_b128 v[74:77], v[0:1], off offset:272 |
| 45 | +; CHECK-NEXT: global_load_b128 v[78:81], v[0:1], off offset:256 |
| 46 | +; CHECK-NEXT: s_waitcnt vmcnt(19) |
| 47 | +; CHECK-NEXT: ds_store_b128 v86, v[2:5] |
| 48 | +; CHECK-NEXT: s_waitcnt vmcnt(18) |
| 49 | +; CHECK-NEXT: ds_store_b128 v86, v[6:9] offset:16 |
| 50 | +; CHECK-NEXT: s_waitcnt vmcnt(17) |
| 51 | +; CHECK-NEXT: ds_store_b128 v86, v[10:13] offset:32 |
| 52 | +; CHECK-NEXT: s_waitcnt vmcnt(16) |
| 53 | +; CHECK-NEXT: ds_store_b128 v86, v[14:17] offset:48 |
| 54 | +; CHECK-NEXT: s_waitcnt vmcnt(15) |
| 55 | +; CHECK-NEXT: ds_store_b128 v86, v[18:21] offset:64 |
| 56 | +; CHECK-NEXT: s_waitcnt vmcnt(14) |
| 57 | +; CHECK-NEXT: ds_store_b128 v86, v[22:25] offset:80 |
| 58 | +; CHECK-NEXT: s_waitcnt vmcnt(13) |
| 59 | +; CHECK-NEXT: ds_store_b128 v86, v[26:29] offset:96 |
| 60 | +; CHECK-NEXT: s_waitcnt vmcnt(12) |
| 61 | +; CHECK-NEXT: ds_store_b128 v86, v[30:33] offset:112 |
| 62 | +; CHECK-NEXT: s_waitcnt vmcnt(11) |
| 63 | +; CHECK-NEXT: ds_store_b128 v86, v[34:37] offset:128 |
| 64 | +; CHECK-NEXT: s_waitcnt vmcnt(10) |
| 65 | +; CHECK-NEXT: ds_store_b128 v86, v[38:41] offset:144 |
| 66 | +; CHECK-NEXT: s_waitcnt vmcnt(9) |
| 67 | +; CHECK-NEXT: ds_store_b128 v86, v[42:45] offset:160 |
| 68 | +; CHECK-NEXT: s_clause 0xb |
| 69 | +; CHECK-NEXT: global_load_b128 v[2:5], v[0:1], off offset:368 |
| 70 | +; CHECK-NEXT: global_load_b128 v[6:9], v[0:1], off offset:352 |
| 71 | +; CHECK-NEXT: global_load_b128 v[10:13], v[0:1], off offset:336 |
| 72 | +; CHECK-NEXT: global_load_b128 v[14:17], v[0:1], off offset:320 |
| 73 | +; CHECK-NEXT: global_load_b128 v[18:21], v[0:1], off offset:432 |
| 74 | +; CHECK-NEXT: global_load_b128 v[22:25], v[0:1], off offset:416 |
| 75 | +; CHECK-NEXT: global_load_b128 v[26:29], v[0:1], off offset:400 |
| 76 | +; CHECK-NEXT: global_load_b128 v[30:33], v[0:1], off offset:384 |
| 77 | +; CHECK-NEXT: global_load_b128 v[34:37], v[0:1], off offset:464 |
| 78 | +; CHECK-NEXT: global_load_b128 v[38:41], v[0:1], off offset:448 |
| 79 | +; CHECK-NEXT: global_load_b128 v[42:45], v[0:1], off offset:480 |
| 80 | +; CHECK-NEXT: global_load_b128 v[82:85], v[0:1], off offset:496 |
| 81 | +; CHECK-NEXT: s_waitcnt vmcnt(20) |
| 82 | +; CHECK-NEXT: ds_store_b128 v86, v[46:49] offset:176 |
| 83 | +; CHECK-NEXT: s_waitcnt vmcnt(16) |
| 84 | +; CHECK-NEXT: ds_store_b128 v86, v[62:65] offset:192 |
| 85 | +; CHECK-NEXT: ds_store_b128 v86, v[58:61] offset:208 |
| 86 | +; CHECK-NEXT: ds_store_b128 v86, v[54:57] offset:224 |
| 87 | +; CHECK-NEXT: ds_store_b128 v86, v[50:53] offset:240 |
| 88 | +; CHECK-NEXT: s_waitcnt vmcnt(12) |
| 89 | +; CHECK-NEXT: ds_store_b128 v86, v[78:81] offset:256 |
| 90 | +; CHECK-NEXT: ds_store_b128 v86, v[74:77] offset:272 |
| 91 | +; CHECK-NEXT: ds_store_b128 v86, v[70:73] offset:288 |
| 92 | +; CHECK-NEXT: ds_store_b128 v86, v[66:69] offset:304 |
| 93 | +; CHECK-NEXT: s_waitcnt vmcnt(8) |
| 94 | +; CHECK-NEXT: ds_store_b128 v86, v[14:17] offset:320 |
| 95 | +; CHECK-NEXT: ds_store_b128 v86, v[10:13] offset:336 |
| 96 | +; CHECK-NEXT: ds_store_b128 v86, v[6:9] offset:352 |
| 97 | +; CHECK-NEXT: ds_store_b128 v86, v[2:5] offset:368 |
| 98 | +; CHECK-NEXT: s_waitcnt vmcnt(4) |
| 99 | +; CHECK-NEXT: ds_store_b128 v86, v[30:33] offset:384 |
| 100 | +; CHECK-NEXT: ds_store_b128 v86, v[26:29] offset:400 |
| 101 | +; CHECK-NEXT: ds_store_b128 v86, v[22:25] offset:416 |
| 102 | +; CHECK-NEXT: ds_store_b128 v86, v[18:21] offset:432 |
| 103 | +; CHECK-NEXT: s_waitcnt vmcnt(2) |
| 104 | +; CHECK-NEXT: ds_store_b128 v86, v[38:41] offset:448 |
| 105 | +; CHECK-NEXT: ds_store_b128 v86, v[34:37] offset:464 |
| 106 | +; CHECK-NEXT: s_waitcnt vmcnt(1) |
| 107 | +; CHECK-NEXT: ds_store_b128 v86, v[42:45] offset:480 |
| 108 | +; CHECK-NEXT: s_waitcnt vmcnt(0) |
| 109 | +; CHECK-NEXT: ds_store_b128 v86, v[82:85] offset:496 |
| 110 | +; CHECK-NEXT: s_endpgm |
| 111 | + call void @copy(ptr addrspace(1) %src, i32 0) ; first of 32 copies at consecutive vec4 indices 0..31 (byte offsets 0..496 in the CHECK lines above) |
| 112 | + call void @copy(ptr addrspace(1) %src, i32 1) |
| 113 | + call void @copy(ptr addrspace(1) %src, i32 2) |
| 114 | + call void @copy(ptr addrspace(1) %src, i32 3) |
| 115 | + call void @copy(ptr addrspace(1) %src, i32 4) |
| 116 | + call void @copy(ptr addrspace(1) %src, i32 5) |
| 117 | + call void @copy(ptr addrspace(1) %src, i32 6) |
| 118 | + call void @copy(ptr addrspace(1) %src, i32 7) |
| 119 | + call void @copy(ptr addrspace(1) %src, i32 8) |
| 120 | + call void @copy(ptr addrspace(1) %src, i32 9) |
| 121 | + call void @copy(ptr addrspace(1) %src, i32 10) |
| 122 | + call void @copy(ptr addrspace(1) %src, i32 11) |
| 123 | + call void @copy(ptr addrspace(1) %src, i32 12) |
| 124 | + call void @copy(ptr addrspace(1) %src, i32 13) |
| 125 | + call void @copy(ptr addrspace(1) %src, i32 14) |
| 126 | + call void @copy(ptr addrspace(1) %src, i32 15) |
| 127 | + call void @copy(ptr addrspace(1) %src, i32 16) |
| 128 | + call void @copy(ptr addrspace(1) %src, i32 17) |
| 129 | + call void @copy(ptr addrspace(1) %src, i32 18) |
| 130 | + call void @copy(ptr addrspace(1) %src, i32 19) |
| 131 | + call void @copy(ptr addrspace(1) %src, i32 20) |
| 132 | + call void @copy(ptr addrspace(1) %src, i32 21) |
| 133 | + call void @copy(ptr addrspace(1) %src, i32 22) |
| 134 | + call void @copy(ptr addrspace(1) %src, i32 23) |
| 135 | + call void @copy(ptr addrspace(1) %src, i32 24) |
| 136 | + call void @copy(ptr addrspace(1) %src, i32 25) |
| 137 | + call void @copy(ptr addrspace(1) %src, i32 26) |
| 138 | + call void @copy(ptr addrspace(1) %src, i32 27) |
| 139 | + call void @copy(ptr addrspace(1) %src, i32 28) |
| 140 | + call void @copy(ptr addrspace(1) %src, i32 29) |
| 141 | + call void @copy(ptr addrspace(1) %src, i32 30) |
| 142 | + call void @copy(ptr addrspace(1) %src, i32 31) ; last copy: 32 x <4 x i32> = 128 dwords loaded in total |
| 143 | + ret void |
| 144 | +} |
0 commit comments