Skip to content

Commit 7ddd036

Browse files
committed
AMDGPU/GlobalISel: Insert m0 initialization before sextload/zextload
Fixes the missing m0 initialization on pre-gfx9 targets for extending loads (sextload/zextload) from local (LDS) memory.
1 parent a075e78 commit 7ddd036

File tree

7 files changed

+152
-139
lines changed

7 files changed

+152
-139
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3532,6 +3532,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
35323532
return true;
35333533
return selectImpl(I, *CoverageInfo);
35343534
case TargetOpcode::G_LOAD:
3535+
case TargetOpcode::G_ZEXTLOAD:
3536+
case TargetOpcode::G_SEXTLOAD:
35353537
case TargetOpcode::G_STORE:
35363538
case TargetOpcode::G_ATOMIC_CMPXCHG:
35373539
case TargetOpcode::G_ATOMICRMW_XCHG:

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sextload-local.mir

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,18 @@ body: |
1919
; GFX6: liveins: $vgpr0
2020
; GFX6-NEXT: {{ $}}
2121
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
22+
; GFX6-NEXT: $m0 = S_MOV_B32 -1
2223
; GFX6-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
2324
; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_I8_]]
25+
;
2426
; GFX7-LABEL: name: sextload_local_s32_from_s8_align1
2527
; GFX7: liveins: $vgpr0
2628
; GFX7-NEXT: {{ $}}
2729
; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
30+
; GFX7-NEXT: $m0 = S_MOV_B32 -1
2831
; GFX7-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
2932
; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_I8_]]
33+
;
3034
; GFX9-LABEL: name: sextload_local_s32_from_s8_align1
3135
; GFX9: liveins: $vgpr0
3236
; GFX9-NEXT: {{ $}}
@@ -53,14 +57,18 @@ body: |
5357
; GFX6: liveins: $vgpr0
5458
; GFX6-NEXT: {{ $}}
5559
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
60+
; GFX6-NEXT: $m0 = S_MOV_B32 -1
5661
; GFX6-NEXT: [[DS_READ_I16_:%[0-9]+]]:vgpr_32 = DS_READ_I16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3)
5762
; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_I16_]]
63+
;
5864
; GFX7-LABEL: name: sextload_local_s32_from_s16_align2
5965
; GFX7: liveins: $vgpr0
6066
; GFX7-NEXT: {{ $}}
6167
; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
68+
; GFX7-NEXT: $m0 = S_MOV_B32 -1
6269
; GFX7-NEXT: [[DS_READ_I16_:%[0-9]+]]:vgpr_32 = DS_READ_I16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3)
6370
; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_I16_]]
71+
;
6472
; GFX9-LABEL: name: sextload_local_s32_from_s16_align2
6573
; GFX9: liveins: $vgpr0
6674
; GFX9-NEXT: {{ $}}
@@ -105,15 +113,19 @@ body: |
105113
; GFX6-NEXT: {{ $}}
106114
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
107115
; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
108-
; GFX6-NEXT: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
109-
; GFX6-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 %2, 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
116+
; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
117+
; GFX6-NEXT: $m0 = S_MOV_B32 -1
118+
; GFX6-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
110119
; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_I8_]]
120+
;
111121
; GFX7-LABEL: name: sextload_local_s32_from_s8_align1_offset4095
112122
; GFX7: liveins: $vgpr0
113123
; GFX7-NEXT: {{ $}}
114124
; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
125+
; GFX7-NEXT: $m0 = S_MOV_B32 -1
115126
; GFX7-NEXT: [[DS_READ_I8_:%[0-9]+]]:vgpr_32 = DS_READ_I8 [[COPY]], 4095, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
116127
; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_I8_]]
128+
;
117129
; GFX9-LABEL: name: sextload_local_s32_from_s8_align1_offset4095
118130
; GFX9: liveins: $vgpr0
119131
; GFX9-NEXT: {{ $}}

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zextload-local.mir

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,18 @@ body: |
1919
; GFX6: liveins: $vgpr0
2020
; GFX6-NEXT: {{ $}}
2121
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
22+
; GFX6-NEXT: $m0 = S_MOV_B32 -1
2223
; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
2324
; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]]
25+
;
2426
; GFX7-LABEL: name: zextload_local_s32_from_s8_align1
2527
; GFX7: liveins: $vgpr0
2628
; GFX7-NEXT: {{ $}}
2729
; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
30+
; GFX7-NEXT: $m0 = S_MOV_B32 -1
2831
; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
2932
; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]]
33+
;
3034
; GFX9-LABEL: name: zextload_local_s32_from_s8_align1
3135
; GFX9: liveins: $vgpr0
3236
; GFX9-NEXT: {{ $}}
@@ -53,14 +57,18 @@ body: |
5357
; GFX6: liveins: $vgpr0
5458
; GFX6-NEXT: {{ $}}
5559
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
60+
; GFX6-NEXT: $m0 = S_MOV_B32 -1
5661
; GFX6-NEXT: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3)
5762
; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U16_]]
63+
;
5864
; GFX7-LABEL: name: zextload_local_s32_from_s16_align2
5965
; GFX7: liveins: $vgpr0
6066
; GFX7-NEXT: {{ $}}
6167
; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
68+
; GFX7-NEXT: $m0 = S_MOV_B32 -1
6269
; GFX7-NEXT: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3)
6370
; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U16_]]
71+
;
6472
; GFX9-LABEL: name: zextload_local_s32_from_s16_align2
6573
; GFX9: liveins: $vgpr0
6674
; GFX9-NEXT: {{ $}}
@@ -105,15 +113,19 @@ body: |
105113
; GFX6-NEXT: {{ $}}
106114
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
107115
; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
108-
; GFX6-NEXT: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
109-
; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
116+
; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
117+
; GFX6-NEXT: $m0 = S_MOV_B32 -1
118+
; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
110119
; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]]
120+
;
111121
; GFX7-LABEL: name: zextload_local_s32_from_s8_align1_offset4095
112122
; GFX7: liveins: $vgpr0
113123
; GFX7-NEXT: {{ $}}
114124
; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
125+
; GFX7-NEXT: $m0 = S_MOV_B32 -1
115126
; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 4095, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3)
116127
; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]]
128+
;
117129
; GFX9-LABEL: name: zextload_local_s32_from_s8_align1_offset4095
118130
; GFX9: liveins: $vgpr0
119131
; GFX9-NEXT: {{ $}}

llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll

Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -90,54 +90,53 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
9090
; GFX7-LABEL: load_lds_v4i32_align1:
9191
; GFX7: ; %bb.0:
9292
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
93-
; GFX7-NEXT: ds_read_u8 v1, v0 offset:1
94-
; GFX7-NEXT: ds_read_u8 v2, v0
95-
; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
9693
; GFX7-NEXT: s_mov_b32 m0, -1
97-
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
98-
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
99-
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
100-
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
101-
; GFX7-NEXT: ds_read_u8 v2, v0 offset:3
94+
; GFX7-NEXT: ds_read_u8 v1, v0
95+
; GFX7-NEXT: ds_read_u8 v2, v0 offset:1
96+
; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
97+
; GFX7-NEXT: ds_read_u8 v4, v0 offset:3
10298
; GFX7-NEXT: ds_read_u8 v5, v0 offset:4
10399
; GFX7-NEXT: ds_read_u8 v6, v0 offset:5
104100
; GFX7-NEXT: ds_read_u8 v7, v0 offset:6
105101
; GFX7-NEXT: ds_read_u8 v8, v0 offset:7
106-
; GFX7-NEXT: ds_read_u8 v9, v0 offset:8
107-
; GFX7-NEXT: ds_read_u8 v10, v0 offset:9
108-
; GFX7-NEXT: ds_read_u8 v11, v0 offset:10
109-
; GFX7-NEXT: s_waitcnt lgkmcnt(7)
110-
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
102+
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
103+
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
104+
; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
105+
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
106+
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4
111107
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
112108
; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
113109
; GFX7-NEXT: v_or_b32_e32 v4, v2, v1
114-
; GFX7-NEXT: s_waitcnt lgkmcnt(5)
110+
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
115111
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6
116-
; GFX7-NEXT: s_waitcnt lgkmcnt(3)
112+
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
117113
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8
118114
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
119115
; GFX7-NEXT: v_or_b32_e32 v1, v1, v5
120116
; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
121-
; GFX7-NEXT: ds_read_u8 v3, v0 offset:11
122-
; GFX7-NEXT: ds_read_u8 v5, v0 offset:12
123-
; GFX7-NEXT: ds_read_u8 v6, v0 offset:13
124-
; GFX7-NEXT: ds_read_u8 v7, v0 offset:14
125-
; GFX7-NEXT: ds_read_u8 v0, v0 offset:15
126117
; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
118+
; GFX7-NEXT: ds_read_u8 v2, v0 offset:8
119+
; GFX7-NEXT: ds_read_u8 v3, v0 offset:9
120+
; GFX7-NEXT: ds_read_u8 v5, v0 offset:10
121+
; GFX7-NEXT: ds_read_u8 v6, v0 offset:11
122+
; GFX7-NEXT: ds_read_u8 v7, v0 offset:12
123+
; GFX7-NEXT: ds_read_u8 v8, v0 offset:13
124+
; GFX7-NEXT: ds_read_u8 v9, v0 offset:14
125+
; GFX7-NEXT: ds_read_u8 v0, v0 offset:15
127126
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
128-
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v10
127+
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
128+
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
129129
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
130-
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3
131-
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v11
132-
; GFX7-NEXT: v_or_b32_e32 v2, v2, v9
133-
; GFX7-NEXT: v_or_b32_e32 v3, v3, v8
130+
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6
131+
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
132+
; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
134133
; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
135134
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
136-
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v6
137-
; GFX7-NEXT: v_or_b32_e32 v3, v3, v5
135+
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v8
138136
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
139137
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
140-
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7
138+
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v9
139+
; GFX7-NEXT: v_or_b32_e32 v3, v3, v7
141140
; GFX7-NEXT: v_or_b32_e32 v0, v0, v5
142141
; GFX7-NEXT: v_or_b32_e32 v3, v0, v3
143142
; GFX7-NEXT: v_mov_b32_e32 v0, v4
@@ -270,8 +269,8 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
270269
; GFX7-LABEL: load_lds_v4i32_align2:
271270
; GFX7: ; %bb.0:
272271
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
273-
; GFX7-NEXT: ds_read_u16 v1, v0
274272
; GFX7-NEXT: s_mov_b32 m0, -1
273+
; GFX7-NEXT: ds_read_u16 v1, v0
275274
; GFX7-NEXT: ds_read_u16 v2, v0 offset:2
276275
; GFX7-NEXT: ds_read_u16 v3, v0 offset:4
277276
; GFX7-NEXT: ds_read_u16 v4, v0 offset:6
@@ -281,11 +280,12 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
281280
; GFX7-NEXT: ds_read_u16 v8, v0 offset:14
282281
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
283282
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
284-
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
285-
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
286283
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
284+
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
287285
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4
288286
; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
287+
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
288+
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
289289
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
290290
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v8
291291
; GFX7-NEXT: v_or_b32_e32 v2, v2, v5

llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -81,42 +81,42 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
8181
; GFX7-LABEL: load_lds_v3i32_align1:
8282
; GFX7: ; %bb.0:
8383
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
84-
; GFX7-NEXT: ds_read_u8 v1, v0 offset:1
85-
; GFX7-NEXT: ds_read_u8 v2, v0
86-
; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
8784
; GFX7-NEXT: s_mov_b32 m0, -1
88-
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
89-
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
90-
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
91-
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
92-
; GFX7-NEXT: ds_read_u8 v2, v0 offset:3
93-
; GFX7-NEXT: ds_read_u8 v4, v0 offset:4
94-
; GFX7-NEXT: ds_read_u8 v5, v0 offset:5
95-
; GFX7-NEXT: ds_read_u8 v6, v0 offset:6
96-
; GFX7-NEXT: ds_read_u8 v7, v0 offset:7
97-
; GFX7-NEXT: ds_read_u8 v8, v0 offset:8
98-
; GFX7-NEXT: ds_read_u8 v9, v0 offset:9
99-
; GFX7-NEXT: ds_read_u8 v10, v0 offset:10
100-
; GFX7-NEXT: s_waitcnt lgkmcnt(7)
101-
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
85+
; GFX7-NEXT: ds_read_u8 v1, v0
86+
; GFX7-NEXT: ds_read_u8 v2, v0 offset:1
87+
; GFX7-NEXT: ds_read_u8 v3, v0 offset:2
88+
; GFX7-NEXT: ds_read_u8 v4, v0 offset:3
89+
; GFX7-NEXT: ds_read_u8 v5, v0 offset:4
90+
; GFX7-NEXT: ds_read_u8 v6, v0 offset:5
91+
; GFX7-NEXT: ds_read_u8 v7, v0 offset:6
92+
; GFX7-NEXT: ds_read_u8 v8, v0 offset:7
93+
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
94+
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2
95+
; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
96+
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
97+
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4
10298
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
103-
; GFX7-NEXT: ds_read_u8 v0, v0 offset:11
10499
; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
105100
; GFX7-NEXT: v_or_b32_e32 v3, v2, v1
106-
; GFX7-NEXT: s_waitcnt lgkmcnt(6)
107-
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v5
108-
; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
101+
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
102+
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v6
103+
; GFX7-NEXT: v_or_b32_e32 v1, v1, v5
104+
; GFX7-NEXT: s_waitcnt lgkmcnt(1)
105+
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7
106+
; GFX7-NEXT: ds_read_u8 v5, v0 offset:8
107+
; GFX7-NEXT: ds_read_u8 v6, v0 offset:9
108+
; GFX7-NEXT: ds_read_u8 v7, v0 offset:10
109+
; GFX7-NEXT: ds_read_u8 v0, v0 offset:11
109110
; GFX7-NEXT: s_waitcnt lgkmcnt(4)
110-
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v7
111-
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
111+
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v8
112112
; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
113113
; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
114114
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
115-
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v9
115+
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v6
116116
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
117117
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0
118-
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v10
119-
; GFX7-NEXT: v_or_b32_e32 v2, v2, v8
118+
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v7
119+
; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
120120
; GFX7-NEXT: v_or_b32_e32 v0, v0, v4
121121
; GFX7-NEXT: v_or_b32_e32 v2, v0, v2
122122
; GFX7-NEXT: v_mov_b32_e32 v0, v3
@@ -223,8 +223,8 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
223223
; GFX7-LABEL: load_lds_v3i32_align2:
224224
; GFX7: ; %bb.0:
225225
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
226-
; GFX7-NEXT: ds_read_u16 v1, v0
227226
; GFX7-NEXT: s_mov_b32 m0, -1
227+
; GFX7-NEXT: ds_read_u16 v1, v0
228228
; GFX7-NEXT: ds_read_u16 v2, v0 offset:2
229229
; GFX7-NEXT: ds_read_u16 v3, v0 offset:4
230230
; GFX7-NEXT: ds_read_u16 v4, v0 offset:6
@@ -235,9 +235,9 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
235235
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
236236
; GFX7-NEXT: s_waitcnt lgkmcnt(2)
237237
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v4
238+
; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
238239
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
239240
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
240-
; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
241241
; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
242242
; GFX7-NEXT: s_setpc_b64 s[30:31]
243243
;

0 commit comments

Comments
 (0)