Skip to content

Commit 9968ba8

Browse files
committed
Revert "[AMDGPU] Insert readfirstlane in the function returns in sgpr. (#135326)"
This reverts commit 76ced7f since it breaks a lot of bots.
1 parent 4c0ea47 commit 9968ba8

14 files changed (+538, −733 lines)

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3221,7 +3221,6 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
32213221
const SDLoc &DL, SelectionDAG &DAG) const {
32223222
MachineFunction &MF = DAG.getMachineFunction();
32233223
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3224-
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
32253224

32263225
if (AMDGPU::isKernel(CallConv)) {
32273226
return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
@@ -3248,8 +3247,6 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
32483247
SmallVector<SDValue, 48> RetOps;
32493248
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
32503249

3251-
SDValue ReadFirstLane =
3252-
DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
32533250
// Copy the result values into the output registers.
32543251
for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
32553252
++I, ++RealRVLocIdx) {
@@ -3277,9 +3274,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
32773274
default:
32783275
llvm_unreachable("Unknown loc info!");
32793276
}
3280-
if (TRI->isSGPRPhysReg(VA.getLocReg()))
3281-
Arg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Arg.getValueType(),
3282-
ReadFirstLane, Arg);
3277+
32833278
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
32843279
Glue = Chain.getValue(1);
32853280
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));

llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,8 +148,8 @@ define <2 x i64> @v_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
148148
define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
149149
; GFX9-LABEL: s_add_v2i64_splat_const_low_bits_known0_0:
150150
; GFX9: ; %bb.0:
151-
; GFX9-NEXT: s_add_i32 s3, s3, 1
152151
; GFX9-NEXT: s_add_i32 s1, s1, 1
152+
; GFX9-NEXT: s_add_i32 s3, s3, 1
153153
; GFX9-NEXT: ; return to shader part epilog
154154
%add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
155155
ret <2 x i64> %add
@@ -158,8 +158,8 @@ define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64>
158158
define amdgpu_ps <2 x i64> @s_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
159159
; GFX9-LABEL: s_add_v2i64_nonsplat_const_low_bits_known0_0:
160160
; GFX9: ; %bb.0:
161-
; GFX9-NEXT: s_add_i32 s3, s3, 2
162161
; GFX9-NEXT: s_add_i32 s1, s1, 1
162+
; GFX9-NEXT: s_add_i32 s3, s3, 2
163163
; GFX9-NEXT: ; return to shader part epilog
164164
%add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
165165
ret <2 x i64> %add

llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll

Lines changed: 16 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -110,11 +110,9 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, <4 x i32
110110
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
111111
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
112112
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
113-
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY8]], implicit $exec
114113
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
115-
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
116-
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
117-
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
114+
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY8]]
115+
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY9]]
118116
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
119117
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
120118
ret double %ret
@@ -138,11 +136,9 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, <4 x i32>
138136
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
139137
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
140138
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
141-
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
142139
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
143-
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
144-
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
145-
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
140+
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY9]]
141+
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY10]]
146142
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
147143
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
148144
ret double %ret
@@ -166,11 +162,9 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, <4 x i32>
166162
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
167163
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
168164
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
169-
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
170165
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
171-
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
172-
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
173-
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
166+
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY9]]
167+
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY10]]
174168
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
175169
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
176170
ret double %ret
@@ -196,11 +190,9 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x i32
196190
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
197191
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
198192
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
199-
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
200193
; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
201-
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec
202-
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
203-
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
194+
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY10]]
195+
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY11]]
204196
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
205197
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
206198
ret double %ret
@@ -342,11 +334,9 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offset_rtn(double %val, ptr
342334
; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
343335
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
344336
; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
345-
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
346337
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
347-
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
348-
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
349-
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
338+
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY12]]
339+
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY13]]
350340
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
351341
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
352342
ret double %ret
@@ -376,11 +366,9 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offen_rtn(double %val, ptr a
376366
; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
377367
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
378368
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
379-
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
380369
; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
381-
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
382-
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
383-
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
370+
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY13]]
371+
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY14]]
384372
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
385373
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
386374
ret double %ret
@@ -410,11 +398,9 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_idxen_rtn(double %val, ptr a
410398
; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
411399
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
412400
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
413-
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
414401
; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
415-
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
416-
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
417-
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
402+
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY13]]
403+
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY14]]
418404
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
419405
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
420406
ret double %ret
@@ -446,11 +432,9 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_bothen_rtn(double %val, ptr
446432
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
447433
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
448434
; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
449-
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
450435
; GFX90A_GFX942-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
451-
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec
452-
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
453-
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
436+
; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY14]]
437+
; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY15]]
454438
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
455439
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
456440
ret double %ret

llvm/test/CodeGen/AMDGPU/constrained-shift.ll

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -168,26 +168,26 @@ define <4 x i32> @csh_v4i32(<4 x i32> %a, <4 x i32> %b) {
168168
define amdgpu_ps <4 x i32> @s_csh_v4i32(<4 x i32> inreg %a, <4 x i32> inreg %b) {
169169
; CHECK-LABEL: s_csh_v4i32:
170170
; CHECK: ; %bb.0:
171-
; CHECK-NEXT: s_lshl_b32 s8, s3, s7
172-
; CHECK-NEXT: s_lshl_b32 s9, s2, s6
173-
; CHECK-NEXT: s_lshl_b32 s10, s1, s5
174-
; CHECK-NEXT: s_lshl_b32 s11, s0, s4
175-
; CHECK-NEXT: s_lshr_b32 s12, s3, s7
176-
; CHECK-NEXT: s_lshr_b32 s13, s2, s6
177-
; CHECK-NEXT: s_lshr_b32 s14, s1, s5
178-
; CHECK-NEXT: s_lshr_b32 s15, s0, s4
179-
; CHECK-NEXT: s_ashr_i32 s0, s0, s4
180-
; CHECK-NEXT: s_ashr_i32 s1, s1, s5
181-
; CHECK-NEXT: s_ashr_i32 s2, s2, s6
171+
; CHECK-NEXT: s_lshl_b32 s8, s0, s4
172+
; CHECK-NEXT: s_lshl_b32 s9, s1, s5
173+
; CHECK-NEXT: s_lshl_b32 s10, s2, s6
174+
; CHECK-NEXT: s_lshl_b32 s11, s3, s7
175+
; CHECK-NEXT: s_lshr_b32 s12, s0, s4
176+
; CHECK-NEXT: s_lshr_b32 s13, s1, s5
177+
; CHECK-NEXT: s_lshr_b32 s14, s2, s6
178+
; CHECK-NEXT: s_lshr_b32 s15, s3, s7
182179
; CHECK-NEXT: s_ashr_i32 s3, s3, s7
180+
; CHECK-NEXT: s_ashr_i32 s2, s2, s6
181+
; CHECK-NEXT: s_ashr_i32 s1, s1, s5
182+
; CHECK-NEXT: s_ashr_i32 s0, s0, s4
183183
; CHECK-NEXT: s_add_i32 s4, s11, s15
184184
; CHECK-NEXT: s_add_i32 s5, s10, s14
185185
; CHECK-NEXT: s_add_i32 s6, s9, s13
186186
; CHECK-NEXT: s_add_i32 s7, s8, s12
187-
; CHECK-NEXT: s_add_i32 s3, s7, s3
188-
; CHECK-NEXT: s_add_i32 s2, s6, s2
189-
; CHECK-NEXT: s_add_i32 s1, s5, s1
190-
; CHECK-NEXT: s_add_i32 s0, s4, s0
187+
; CHECK-NEXT: s_add_i32 s0, s7, s0
188+
; CHECK-NEXT: s_add_i32 s1, s6, s1
189+
; CHECK-NEXT: s_add_i32 s2, s5, s2
190+
; CHECK-NEXT: s_add_i32 s3, s4, s3
191191
; CHECK-NEXT: ; return to shader part epilog
192192
;
193193
; GISEL-LABEL: s_csh_v4i32:

0 commit comments

Comments (0)