Skip to content
This repository was archived by the owner on Mar 28, 2020. It is now read-only.

Commit d3adac5

Browse files
committed
AMDGPU/SI: Enable lanemask tracking in misched
Summary: This results in higher register usage, but should make it easier for the compiler to hide latency. This pass is a prerequisite for some more scheduler improvements, and I think the increased register usage with this patch is acceptable, because when combined with the scheduler improvements, the total register usage will decrease. shader-db stats: 2382 shaders in 478 tests Totals: SGPRS: 48672 -> 49088 (0.85 %) VGPRS: 34148 -> 34847 (2.05 %) Code Size: 1285816 -> 1289128 (0.26 %) bytes LDS: 28 -> 28 (0.00 %) blocks Scratch: 492544 -> 573440 (16.42 %) bytes per wave Max Waves: 6856 -> 6846 (-0.15 %) Wait states: 0 -> 0 (0.00 %) Depends on D18451 Reviewers: nhaehnle, arsenm Subscribers: arsenm, llvm-commits Differential Revision: http://reviews.llvm.org/D18452 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@264876 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 5254fb9 commit d3adac5

31 files changed

+128
-139
lines changed

lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,10 @@ void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
156156
// register spills than just using one of these approaches on its own.
157157
Policy.OnlyTopDown = false;
158158
Policy.OnlyBottomUp = false;
159+
160+
// Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
161+
if (!enableSIScheduler())
162+
Policy.ShouldTrackLaneMasks = true;
159163
}
160164
}
161165

test/CodeGen/AMDGPU/and.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -282,11 +282,11 @@ define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(
282282
; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
283283
; SI-NOT: and
284284
; SI: v_and_b32_e32 v[[RESLO0:[0-9]+]], 63, v[[LO0]]
285-
; SI: v_and_b32_e32 v[[RESLO1:[0-9]+]], 63, v[[LO1]]
286285
; SI-NOT: and
287-
; SI: buffer_store_dwordx2
286+
; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]]
287+
; SI: v_and_b32_e32 v[[RESLO1:[0-9]+]], 63, v[[LO1]]
288288
; SI-NOT: and
289-
; SI: buffer_store_dwordx2
289+
; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]]
290290
define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
291291
%a = load volatile i64, i64 addrspace(1)* %aptr
292292
%b = load volatile i64, i64 addrspace(1)* %aptr

test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@
33
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
44

55
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
6-
; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
76
; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
87
; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
98
; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
109
; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
10+
; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
1111
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
1212
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
1313
; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
@@ -21,12 +21,12 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs
2121
}
2222

2323
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
24-
; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
25-
; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
2624
; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
2725
; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
2826
; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
2927
; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
28+
; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
29+
; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
3030
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
3131
; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
3232
; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]

test/CodeGen/AMDGPU/captured-frame-index.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,10 @@
22

33
; GCN-LABEL: {{^}}stored_fi_to_lds:
44
; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
5-
; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
65
; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
76
; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
8-
7+
; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
98
; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
10-
119
; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
1210
define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
1311
%tmp = alloca float
@@ -19,14 +17,14 @@ define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
1917
; Offset is applied
2018
; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects:
2119
; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
22-
; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
2320
; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
2421
; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
2522

2623
; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
2724
; GCN: buffer_store_dword v{{[0-9]+}}, [[FI1]]
2825

2926

27+
; GCN-DAG: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
3028
; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
3129
; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
3230

test/CodeGen/AMDGPU/commute_modifiers.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ declare float @llvm.fma.f32(float, float, float) nounwind readnone
77
; FUNC-LABEL: @commute_add_imm_fabs_f32
88
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
99
; SI: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, |[[X]]|
10-
; SI-NEXT: buffer_store_dword [[REG]]
10+
; SI: buffer_store_dword [[REG]]
1111
define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
1212
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1313
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -21,7 +21,7 @@ define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(
2121
; FUNC-LABEL: @commute_mul_imm_fneg_fabs_f32
2222
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
2323
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], -4.0, |[[X]]|
24-
; SI-NEXT: buffer_store_dword [[REG]]
24+
; SI: buffer_store_dword [[REG]]
2525
define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
2626
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2727
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -36,7 +36,7 @@ define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrs
3636
; FUNC-LABEL: @commute_mul_imm_fneg_f32
3737
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
3838
; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]]
39-
; SI-NEXT: buffer_store_dword [[REG]]
39+
; SI: buffer_store_dword [[REG]]
4040
define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
4141
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
4242
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -52,7 +52,7 @@ define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(
5252
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
5353
; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
5454
; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]]
55-
; SI-NEXT: buffer_store_dword [[REG]]
55+
; SI: buffer_store_dword [[REG]]
5656
define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
5757
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
5858
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -67,7 +67,7 @@ define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(
6767
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
6868
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
6969
; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
70-
; SI-NEXT: buffer_store_dword [[REG]]
70+
; SI: buffer_store_dword [[REG]]
7171
define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
7272
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
7373
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -84,7 +84,7 @@ define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)*
8484
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
8585
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
8686
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
87-
; SI-NEXT: buffer_store_dword [[REG]]
87+
; SI: buffer_store_dword [[REG]]
8888
define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
8989
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
9090
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -101,7 +101,7 @@ define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)*
101101
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
102102
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
103103
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
104-
; SI-NEXT: buffer_store_dword [[REG]]
104+
; SI: buffer_store_dword [[REG]]
105105
define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
106106
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
107107
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -120,7 +120,7 @@ define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace
120120
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
121121
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
122122
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
123-
; SI-NEXT: buffer_store_dword [[REG]]
123+
; SI: buffer_store_dword [[REG]]
124124
define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
125125
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
126126
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -138,7 +138,7 @@ define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrs
138138
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
139139
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
140140
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
141-
; SI-NEXT: buffer_store_dword [[REG]]
141+
; SI: buffer_store_dword [[REG]]
142142
define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
143143
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
144144
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid

test/CodeGen/AMDGPU/ctlz.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %
116116
; SI-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
117117
; SI-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
118118
; SI-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
119-
; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[FFBH_LO]]
119+
; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[ADD]]
120120
; SI-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
121121
; SI-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
122122
; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}

test/CodeGen/AMDGPU/ctlz_zero_undef.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add
149149
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1:
150150
; SI: buffer_load_dword [[VAL:v[0-9]+]],
151151
; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
152-
; SI-NEXT: buffer_store_dword [[RESULT]],
152+
; SI: buffer_store_dword [[RESULT]],
153153
define void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
154154
%val = load i32, i32 addrspace(1)* %valptr
155155
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
@@ -162,7 +162,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add
162162
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_neg1:
163163
; SI: buffer_load_dword [[VAL:v[0-9]+]],
164164
; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
165-
; SI-NEXT: buffer_store_dword [[RESULT]],
165+
; SI: buffer_store_dword [[RESULT]],
166166
define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
167167
%val = load i32, i32 addrspace(1)* %valptr
168168
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone

test/CodeGen/AMDGPU/ctpop64.ll

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,9 +116,10 @@ define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrs
116116
; FUNC-LABEL: {{^}}ctpop_i64_in_br:
117117
; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd
118118
; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34
119-
; GCN: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
119+
; GCN-DAG: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
120+
; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0
120121
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]]
121-
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]]
122+
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[ZERO]]
122123
; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
123124
; GCN: s_endpgm
124125
define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {

test/CodeGen/AMDGPU/cvt_f32_ubyte.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8>
3333
; SI-NOT: bfe
3434
; SI-NOT: v_cvt_f32_ubyte3_e32
3535
; SI-DAG: v_cvt_f32_ubyte2_e32
36-
; SI-DAG: v_cvt_f32_ubyte1_e32
37-
; SI-DAG: v_cvt_f32_ubyte0_e32
36+
; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
37+
; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
3838
; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
3939
define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
4040
%load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4

test/CodeGen/AMDGPU/ds_read2_superreg.ll

Lines changed: 16 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -85,14 +85,8 @@ define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
8585
}
8686

8787
; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align8:
88-
; CI-DAG: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
89-
90-
; FIXME: These moves shouldn't be necessary, it should be able to
91-
; store the same register if offset1 was the non-zero offset.
92-
93-
; CI: v_mov_b32
94-
; CI: v_mov_b32
95-
; CI: buffer_store_dwordx4
88+
; CI: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
89+
; CI: buffer_store_dwordx4 [[REG_ZW]]
9690
; CI: s_endpgm
9791
define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 {
9892
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -104,10 +98,8 @@ define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out)
10498
}
10599

106100
; CI-LABEL: {{^}}simple_read2_v4f32_superreg:
107-
; CI: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
108-
; CI: v_mov_b32
109-
; CI: v_mov_b32
110-
; CI: buffer_store_dwordx4
101+
; CI-DAG: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
102+
; CI: buffer_store_dwordx4 [[REG_ZW]]
111103
; CI: s_endpgm
112104
define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
113105
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -120,14 +112,10 @@ define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
120112

121113
; FIXME: Extra moves shuffling superregister
122114
; CI-LABEL: {{^}}simple_read2_v8f32_superreg:
123-
; CI: ds_read2_b64 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT7:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
124-
; CI: v_mov_b32
125-
; CI: v_mov_b32
126-
; CI: ds_read2_b64 v{{\[}}[[REG_ELT6:[0-9]+]]:[[REG_ELT5:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}}
127-
; CI: v_mov_b32
128-
; CI: v_mov_b32
129-
; CI: buffer_store_dwordx4
130-
; CI: buffer_store_dwordx4
115+
; CI-DAG: ds_read2_b64 [[VEC_HI:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:2 offset1:3{{$}}
116+
; CI-DAG: ds_read2_b64 [[VEC_LO:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
117+
; CI-DAG: buffer_store_dwordx4 [[VEC_HI]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
118+
; CI-DAG: buffer_store_dwordx4 [[VEC_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}}
131119
; CI: s_endpgm
132120
define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
133121
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -140,22 +128,15 @@ define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
140128

141129
; FIXME: Extra moves shuffling superregister
142130
; CI-LABEL: {{^}}simple_read2_v16f32_superreg:
143-
; CI: ds_read2_b64 v{{\[}}[[REG_ELT11:[0-9]+]]:[[REG_ELT15:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
144-
; CI: v_mov_b32
145-
; CI: v_mov_b32
146-
; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:7{{$}}
147-
; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:4{{$}}
148-
; CI: v_mov_b32
149-
; CI: v_mov_b32
150-
; CI: ds_read2_b64 v{{\[}}[[REG_ELT12:[0-9]+]]:[[REG_ELT10:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}}
151-
; CI: v_mov_b32
152-
; CI: v_mov_b32
153-
131+
; CI-DAG: ds_read2_b64 [[VEC0_3:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
132+
; CI-DAG: ds_read2_b64 [[VEC4_7:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:2 offset1:3{{$}}
133+
; CI-DAG: ds_read2_b64 [[VEC8_11:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:4 offset1:5{{$}}
134+
; CI-DAG: ds_read2_b64 [[VEC12_15:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:6 offset1:7{{$}}
154135
; CI: s_waitcnt lgkmcnt(0)
155-
; CI: buffer_store_dwordx4
156-
; CI: buffer_store_dwordx4
157-
; CI: buffer_store_dwordx4
158-
; CI: buffer_store_dwordx4
136+
; CI-DAG: buffer_store_dwordx4 [[VEC0_3]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}}
137+
; CI-DAG: buffer_store_dwordx4 [[VEC4_7]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
138+
; CI-DAG: buffer_store_dwordx4 [[VEC8_11]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32
139+
; CI-DAG: buffer_store_dwordx4 [[VEC12_15]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48
159140
; CI: s_endpgm
160141
define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
161142
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1

test/CodeGen/AMDGPU/ds_read2st64.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,9 @@ define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float add
6565

6666
; SI-LABEL: @simple_read2st64_f32_over_max_offset
6767
; SI-NOT: ds_read2st64_b32
68-
; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
69-
; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
70-
; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]
68+
; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
69+
; SI-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
70+
; SI-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
7171
; SI: s_endpgm
7272
define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
7373
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1

test/CodeGen/AMDGPU/ds_write2.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -179,8 +179,8 @@ define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float add
179179
}
180180

181181
; SI-LABEL: @simple_write2_two_val_f32_x2
182-
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
183-
; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
182+
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset1:11
183+
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
184184
; SI: s_endpgm
185185
define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
186186
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -209,8 +209,8 @@ define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspac
209209
}
210210

211211
; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
212-
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
213-
; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
212+
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset0:3 offset1:11
213+
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
214214
; SI: s_endpgm
215215
define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
216216
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1

test/CodeGen/AMDGPU/fcopysign.f64.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind r
1313
; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
1414
; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
1515
; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
16-
; GCN: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
17-
; GCN: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
16+
; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
17+
; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
1818
; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
1919
; GCN: s_endpgm
2020
define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {

0 commit comments

Comments
 (0)