Skip to content

Commit fe8335b

Browse files
authored
[AMDGPU] Select 64-bit imm moves if can be encoded as 32 bit operand (#70395)
This allows folding of 64-bit operands if they fit into 32 bits. Fixes #67781
1 parent ee6d62d commit fe8335b

File tree

85 files changed

+10087
-11656
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

85 files changed

+10087
-11656
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -595,11 +595,15 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
595595
break;
596596

597597
uint64_t Imm;
598-
if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
598+
if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
599599
Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
600-
else {
600+
if (AMDGPU::isValid32BitLiteral(Imm, true))
601+
break;
602+
} else {
601603
ConstantSDNode *C = cast<ConstantSDNode>(N);
602604
Imm = C->getZExtValue();
605+
if (AMDGPU::isValid32BitLiteral(Imm, false))
606+
break;
603607
}
604608

605609
SDLoc DL(N);
@@ -3014,7 +3018,7 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
30143018
if (!RC || SIRI->isSGPRClass(RC))
30153019
return false;
30163020

3017-
if (RC != &AMDGPU::VS_32RegClass) {
3021+
if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
30183022
AllUsesAcceptSReg = false;
30193023
SDNode * User = *U;
30203024
if (User->isMachineOpcode()) {
@@ -3026,7 +3030,8 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
30263030
if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
30273031
unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
30283032
const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
3029-
if (CommutedRC == &AMDGPU::VS_32RegClass)
3033+
if (CommutedRC == &AMDGPU::VS_32RegClass ||
3034+
CommutedRC == &AMDGPU::VS_64RegClass)
30303035
AllUsesAcceptSReg = true;
30313036
}
30323037
}

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2551,11 +2551,13 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
25512551
MachineOperand &ImmOp = I.getOperand(1);
25522552
Register DstReg = I.getOperand(0).getReg();
25532553
unsigned Size = MRI->getType(DstReg).getSizeInBits();
2554+
bool IsFP = false;
25542555

25552556
// The AMDGPU backend only supports Imm operands and not CImm or FPImm.
25562557
if (ImmOp.isFPImm()) {
25572558
const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
25582559
ImmOp.ChangeToImmediate(Imm.getZExtValue());
2560+
IsFP = true;
25592561
} else if (ImmOp.isCImm()) {
25602562
ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
25612563
} else {
@@ -2568,6 +2570,12 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
25682570
unsigned Opcode;
25692571
if (DstRB->getID() == AMDGPU::VCCRegBankID) {
25702572
Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2573+
} else if (Size == 64 &&
2574+
AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) {
2575+
Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
2576+
I.setDesc(TII.get(Opcode));
2577+
I.addImplicitDefUseOperands(*MF);
2578+
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
25712579
} else {
25722580
Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
25732581

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
367367
SMovOp = AMDGPU::S_MOV_B32;
368368
break;
369369
case AMDGPU::V_MOV_B64_PSEUDO:
370-
SMovOp = AMDGPU::S_MOV_B64;
370+
SMovOp = AMDGPU::S_MOV_B64_IMM_PSEUDO;
371371
break;
372372
}
373373
Imm = ImmOp->getImm();

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1966,6 +1966,29 @@ def : GCNPat <
19661966
(V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
19671967
>;
19681968

1969+
// V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit
1970+
// immediate and will be expanded as needed, but we will only use these patterns
1971+
// for values which can be encoded.
1972+
def : GCNPat <
1973+
(VGPRImm<(i64 imm)>:$imm),
1974+
(V_MOV_B64_PSEUDO imm:$imm)
1975+
>;
1976+
1977+
def : GCNPat <
1978+
(VGPRImm<(f64 fpimm)>:$imm),
1979+
(V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm)))
1980+
>;
1981+
1982+
def : GCNPat <
1983+
(i64 imm:$imm),
1984+
(S_MOV_B64_IMM_PSEUDO imm:$imm)
1985+
>;
1986+
1987+
def : GCNPat <
1988+
(f64 fpimm:$imm),
1989+
(S_MOV_B64_IMM_PSEUDO (i64 (bitcast_fpimm_to_i64 fpimm:$imm)))
1990+
>;
1991+
19691992
def : GCNPat <
19701993
(f32 fpimm:$imm),
19711994
(S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,12 @@ entry:
4141
}
4242

4343
; GCN-LABEL: {{^}}v_clamp_i64_i16_invalid_lower
44+
; GFX6789: v_mov_b32_e32 v{{[0-9]+}}, 0x8001
4445
; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8001
4546
; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
4647
; GFX6789: v_cndmask_b32_e32 [[C:v[0-9]+]], 0, [[C]], vcc
4748

48-
; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8001, [[A]], vcc_lo
49+
; GFX10: v_{{(dual_)?}}cndmask_b32{{(_e32)?}} [[A:v[0-9]+]], 0x8001, [[A]]
4950
; GFX10: v_cndmask_b32_e32 [[B:v[0-9]+]], 0, [[B]], vcc_lo
5051
define i16 @v_clamp_i64_i16_invalid_lower(i64 %in) #0 {
5152
entry:
@@ -56,6 +57,7 @@ entry:
5657
}
5758

5859
; GCN-LABEL: {{^}}v_clamp_i64_i16_invalid_lower_and_higher
60+
; GFX6789: v_mov_b32_e32 v{{[0-9]+}}, 0x8000
5961
; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8000
6062
; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
6163
; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8000, [[A]], vcc_lo

llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll

Lines changed: 59 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -2090,69 +2090,69 @@ define amdgpu_ps double @dyn_extract_v16f64_s_s(i32 inreg %sel) {
20902090
; GCN-LABEL: dyn_extract_v16f64_s_s:
20912091
; GCN: ; %bb.0: ; %entry
20922092
; GCN-NEXT: s_mov_b32 s66, 0
2093+
; GCN-NEXT: s_mov_b32 s64, 0
2094+
; GCN-NEXT: s_mov_b32 s62, 0
2095+
; GCN-NEXT: s_mov_b32 s60, 0
2096+
; GCN-NEXT: s_mov_b32 s58, 0
2097+
; GCN-NEXT: s_mov_b32 s56, 0
2098+
; GCN-NEXT: s_mov_b32 s54, 0
2099+
; GCN-NEXT: s_mov_b32 s52, 0
2100+
; GCN-NEXT: s_mov_b32 s50, 0
2101+
; GCN-NEXT: s_mov_b32 s48, 0
2102+
; GCN-NEXT: s_mov_b32 s46, 0
2103+
; GCN-NEXT: s_mov_b32 s44, 0
2104+
; GCN-NEXT: s_mov_b32 s40, 0
20932105
; GCN-NEXT: s_mov_b64 s[36:37], 1.0
20942106
; GCN-NEXT: s_mov_b32 m0, s2
20952107
; GCN-NEXT: s_mov_b32 s67, 0x40300000
20962108
; GCN-NEXT: s_mov_b32 s65, 0x402e0000
2097-
; GCN-NEXT: s_mov_b32 s64, s66
20982109
; GCN-NEXT: s_mov_b32 s63, 0x402c0000
2099-
; GCN-NEXT: s_mov_b32 s62, s66
21002110
; GCN-NEXT: s_mov_b32 s61, 0x402a0000
2101-
; GCN-NEXT: s_mov_b32 s60, s66
21022111
; GCN-NEXT: s_mov_b32 s59, 0x40280000
2103-
; GCN-NEXT: s_mov_b32 s58, s66
21042112
; GCN-NEXT: s_mov_b32 s57, 0x40260000
2105-
; GCN-NEXT: s_mov_b32 s56, s66
21062113
; GCN-NEXT: s_mov_b32 s55, 0x40240000
2107-
; GCN-NEXT: s_mov_b32 s54, s66
21082114
; GCN-NEXT: s_mov_b32 s53, 0x40220000
2109-
; GCN-NEXT: s_mov_b32 s52, s66
21102115
; GCN-NEXT: s_mov_b32 s51, 0x40200000
2111-
; GCN-NEXT: s_mov_b32 s50, s66
21122116
; GCN-NEXT: s_mov_b32 s49, 0x401c0000
2113-
; GCN-NEXT: s_mov_b32 s48, s66
21142117
; GCN-NEXT: s_mov_b32 s47, 0x40180000
2115-
; GCN-NEXT: s_mov_b32 s46, s66
21162118
; GCN-NEXT: s_mov_b32 s45, 0x40140000
2117-
; GCN-NEXT: s_mov_b32 s44, s66
21182119
; GCN-NEXT: s_mov_b64 s[42:43], 4.0
21192120
; GCN-NEXT: s_mov_b32 s41, 0x40080000
2120-
; GCN-NEXT: s_mov_b32 s40, s66
21212121
; GCN-NEXT: s_mov_b64 s[38:39], 2.0
21222122
; GCN-NEXT: s_movrels_b64 s[0:1], s[36:37]
21232123
; GCN-NEXT: ; return to shader part epilog
21242124
;
21252125
; GFX10PLUS-LABEL: dyn_extract_v16f64_s_s:
21262126
; GFX10PLUS: ; %bb.0: ; %entry
2127-
; GFX10PLUS-NEXT: s_mov_b32 s66, 0
21282127
; GFX10PLUS-NEXT: s_mov_b64 s[36:37], 1.0
21292128
; GFX10PLUS-NEXT: s_mov_b32 m0, s2
2129+
; GFX10PLUS-NEXT: s_mov_b32 s66, 0
2130+
; GFX10PLUS-NEXT: s_mov_b32 s64, 0
2131+
; GFX10PLUS-NEXT: s_mov_b32 s62, 0
2132+
; GFX10PLUS-NEXT: s_mov_b32 s60, 0
2133+
; GFX10PLUS-NEXT: s_mov_b32 s58, 0
2134+
; GFX10PLUS-NEXT: s_mov_b32 s56, 0
2135+
; GFX10PLUS-NEXT: s_mov_b32 s54, 0
2136+
; GFX10PLUS-NEXT: s_mov_b32 s52, 0
2137+
; GFX10PLUS-NEXT: s_mov_b32 s50, 0
2138+
; GFX10PLUS-NEXT: s_mov_b32 s48, 0
2139+
; GFX10PLUS-NEXT: s_mov_b32 s46, 0
2140+
; GFX10PLUS-NEXT: s_mov_b32 s44, 0
2141+
; GFX10PLUS-NEXT: s_mov_b32 s40, 0
21302142
; GFX10PLUS-NEXT: s_mov_b32 s67, 0x40300000
21312143
; GFX10PLUS-NEXT: s_mov_b32 s65, 0x402e0000
2132-
; GFX10PLUS-NEXT: s_mov_b32 s64, s66
21332144
; GFX10PLUS-NEXT: s_mov_b32 s63, 0x402c0000
2134-
; GFX10PLUS-NEXT: s_mov_b32 s62, s66
21352145
; GFX10PLUS-NEXT: s_mov_b32 s61, 0x402a0000
2136-
; GFX10PLUS-NEXT: s_mov_b32 s60, s66
21372146
; GFX10PLUS-NEXT: s_mov_b32 s59, 0x40280000
2138-
; GFX10PLUS-NEXT: s_mov_b32 s58, s66
21392147
; GFX10PLUS-NEXT: s_mov_b32 s57, 0x40260000
2140-
; GFX10PLUS-NEXT: s_mov_b32 s56, s66
21412148
; GFX10PLUS-NEXT: s_mov_b32 s55, 0x40240000
2142-
; GFX10PLUS-NEXT: s_mov_b32 s54, s66
21432149
; GFX10PLUS-NEXT: s_mov_b32 s53, 0x40220000
2144-
; GFX10PLUS-NEXT: s_mov_b32 s52, s66
21452150
; GFX10PLUS-NEXT: s_mov_b32 s51, 0x40200000
2146-
; GFX10PLUS-NEXT: s_mov_b32 s50, s66
21472151
; GFX10PLUS-NEXT: s_mov_b32 s49, 0x401c0000
2148-
; GFX10PLUS-NEXT: s_mov_b32 s48, s66
21492152
; GFX10PLUS-NEXT: s_mov_b32 s47, 0x40180000
2150-
; GFX10PLUS-NEXT: s_mov_b32 s46, s66
21512153
; GFX10PLUS-NEXT: s_mov_b32 s45, 0x40140000
2152-
; GFX10PLUS-NEXT: s_mov_b32 s44, s66
21532154
; GFX10PLUS-NEXT: s_mov_b64 s[42:43], 4.0
21542155
; GFX10PLUS-NEXT: s_mov_b32 s41, 0x40080000
2155-
; GFX10PLUS-NEXT: s_mov_b32 s40, s66
21562156
; GFX10PLUS-NEXT: s_mov_b64 s[38:39], 2.0
21572157
; GFX10PLUS-NEXT: s_movrels_b64 s[0:1], s[36:37]
21582158
; GFX10PLUS-NEXT: ; return to shader part epilog
@@ -3085,10 +3085,10 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
30853085
; GPRIDX-NEXT: ; %bb.0: ; %entry
30863086
; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
30873087
; GPRIDX-NEXT: s_load_dword s8, s[4:5], 0x8
3088+
; GPRIDX-NEXT: s_mov_b32 s4, 0
3089+
; GPRIDX-NEXT: s_mov_b32 s5, 0x40080000
30883090
; GPRIDX-NEXT: s_mov_b32 s2, 0
30893091
; GPRIDX-NEXT: s_mov_b32 s3, 0x40140000
3090-
; GPRIDX-NEXT: s_mov_b32 s5, 0x40080000
3091-
; GPRIDX-NEXT: s_mov_b32 s4, s2
30923092
; GPRIDX-NEXT: s_waitcnt lgkmcnt(0)
30933093
; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1
30943094
; GPRIDX-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0
@@ -3176,10 +3176,10 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
31763176
; MOVREL-NEXT: ; %bb.0: ; %entry
31773177
; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
31783178
; MOVREL-NEXT: s_load_dword s8, s[4:5], 0x8
3179+
; MOVREL-NEXT: s_mov_b32 s4, 0
3180+
; MOVREL-NEXT: s_mov_b32 s5, 0x40080000
31793181
; MOVREL-NEXT: s_mov_b32 s2, 0
31803182
; MOVREL-NEXT: s_mov_b32 s3, 0x40140000
3181-
; MOVREL-NEXT: s_mov_b32 s5, 0x40080000
3182-
; MOVREL-NEXT: s_mov_b32 s4, s2
31833183
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
31843184
; MOVREL-NEXT: s_cmp_eq_u32 s8, 1
31853185
; MOVREL-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0
@@ -3207,7 +3207,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
32073207
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
32083208
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
32093209
; GFX10-NEXT: granulated_workitem_vgpr_count = 0
3210-
; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
3210+
; GFX10-NEXT: granulated_wavefront_sgpr_count = 0
32113211
; GFX10-NEXT: priority = 0
32123212
; GFX10-NEXT: float_mode = 240
32133213
; GFX10-NEXT: priv = 0
@@ -3250,7 +3250,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
32503250
; GFX10-NEXT: gds_segment_byte_size = 0
32513251
; GFX10-NEXT: kernarg_segment_byte_size = 12
32523252
; GFX10-NEXT: workgroup_fbarrier_count = 0
3253-
; GFX10-NEXT: wavefront_sgpr_count = 9
3253+
; GFX10-NEXT: wavefront_sgpr_count = 7
32543254
; GFX10-NEXT: workitem_vgpr_count = 3
32553255
; GFX10-NEXT: reserved_vgpr_first = 0
32563256
; GFX10-NEXT: reserved_vgpr_count = 0
@@ -3267,22 +3267,22 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
32673267
; GFX10-NEXT: .end_amd_kernel_code_t
32683268
; GFX10-NEXT: ; %bb.0: ; %entry
32693269
; GFX10-NEXT: s_clause 0x1
3270-
; GFX10-NEXT: s_load_dword s8, s[4:5], 0x8
3270+
; GFX10-NEXT: s_load_dword s6, s[4:5], 0x8
32713271
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
32723272
; GFX10-NEXT: s_mov_b32 s2, 0
3273-
; GFX10-NEXT: s_mov_b32 s3, 0x40140000
3274-
; GFX10-NEXT: s_mov_b32 s5, 0x40080000
3275-
; GFX10-NEXT: s_mov_b32 s4, s2
3273+
; GFX10-NEXT: s_mov_b32 s3, 0x40080000
32763274
; GFX10-NEXT: v_mov_b32_e32 v2, 0
32773275
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3278-
; GFX10-NEXT: s_cmp_eq_u32 s8, 1
3279-
; GFX10-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0
3280-
; GFX10-NEXT: s_cmp_eq_u32 s8, 2
3281-
; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
3282-
; GFX10-NEXT: s_cmp_eq_u32 s8, 3
3283-
; GFX10-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5]
3284-
; GFX10-NEXT: s_cmp_eq_u32 s8, 4
3276+
; GFX10-NEXT: s_cmp_eq_u32 s6, 1
3277+
; GFX10-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
3278+
; GFX10-NEXT: s_cmp_eq_u32 s6, 2
32853279
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
3280+
; GFX10-NEXT: s_cmp_eq_u32 s6, 3
3281+
; GFX10-NEXT: s_mov_b32 s4, 0
3282+
; GFX10-NEXT: s_mov_b32 s5, 0x40140000
3283+
; GFX10-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
3284+
; GFX10-NEXT: s_cmp_eq_u32 s6, 4
3285+
; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
32863286
; GFX10-NEXT: v_mov_b32_e32 v0, s2
32873287
; GFX10-NEXT: v_mov_b32_e32 v1, s3
32883288
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3299,7 +3299,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
32993299
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
33003300
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
33013301
; GFX11-NEXT: granulated_workitem_vgpr_count = 0
3302-
; GFX11-NEXT: granulated_wavefront_sgpr_count = 1
3302+
; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
33033303
; GFX11-NEXT: priority = 0
33043304
; GFX11-NEXT: float_mode = 240
33053305
; GFX11-NEXT: priv = 0
@@ -3342,7 +3342,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
33423342
; GFX11-NEXT: gds_segment_byte_size = 0
33433343
; GFX11-NEXT: kernarg_segment_byte_size = 12
33443344
; GFX11-NEXT: workgroup_fbarrier_count = 0
3345-
; GFX11-NEXT: wavefront_sgpr_count = 9
3345+
; GFX11-NEXT: wavefront_sgpr_count = 7
33463346
; GFX11-NEXT: workitem_vgpr_count = 3
33473347
; GFX11-NEXT: reserved_vgpr_first = 0
33483348
; GFX11-NEXT: reserved_vgpr_count = 0
@@ -3359,22 +3359,22 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
33593359
; GFX11-NEXT: .end_amd_kernel_code_t
33603360
; GFX11-NEXT: ; %bb.0: ; %entry
33613361
; GFX11-NEXT: s_clause 0x1
3362-
; GFX11-NEXT: s_load_b32 s8, s[0:1], 0x8
3362+
; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8
33633363
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
33643364
; GFX11-NEXT: s_mov_b32 s2, 0
3365-
; GFX11-NEXT: s_mov_b32 s3, 0x40140000
3366-
; GFX11-NEXT: s_mov_b32 s5, 0x40080000
3367-
; GFX11-NEXT: s_mov_b32 s4, s2
3365+
; GFX11-NEXT: s_mov_b32 s3, 0x40080000
33683366
; GFX11-NEXT: v_mov_b32_e32 v2, 0
33693367
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3370-
; GFX11-NEXT: s_cmp_eq_u32 s8, 1
3371-
; GFX11-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0
3372-
; GFX11-NEXT: s_cmp_eq_u32 s8, 2
3373-
; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
3374-
; GFX11-NEXT: s_cmp_eq_u32 s8, 3
3375-
; GFX11-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5]
3376-
; GFX11-NEXT: s_cmp_eq_u32 s8, 4
3368+
; GFX11-NEXT: s_cmp_eq_u32 s6, 1
3369+
; GFX11-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
3370+
; GFX11-NEXT: s_cmp_eq_u32 s6, 2
33773371
; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
3372+
; GFX11-NEXT: s_cmp_eq_u32 s6, 3
3373+
; GFX11-NEXT: s_mov_b32 s4, 0
3374+
; GFX11-NEXT: s_mov_b32 s5, 0x40140000
3375+
; GFX11-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
3376+
; GFX11-NEXT: s_cmp_eq_u32 s6, 4
3377+
; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
33783378
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
33793379
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
33803380
; GFX11-NEXT: s_nop 0
@@ -4784,11 +4784,8 @@ define i32 @v_extract_v64i32_32(ptr addrspace(1) %ptr) {
47844784
; MOVREL-LABEL: v_extract_v64i32_32:
47854785
; MOVREL: ; %bb.0:
47864786
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4787-
; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80
4788-
; MOVREL-NEXT: v_mov_b32_e32 v2, s4
4789-
; MOVREL-NEXT: v_mov_b32_e32 v3, s5
4790-
; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
4791-
; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
4787+
; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0
4788+
; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
47924789
; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
47934790
; MOVREL-NEXT: s_waitcnt vmcnt(0)
47944791
; MOVREL-NEXT: s_setpc_b64 s[30:31]
@@ -4823,11 +4820,8 @@ define i32 @v_extract_v64i32_33(ptr addrspace(1) %ptr) {
48234820
; MOVREL-LABEL: v_extract_v64i32_33:
48244821
; MOVREL: ; %bb.0:
48254822
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4826-
; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80
4827-
; MOVREL-NEXT: v_mov_b32_e32 v2, s4
4828-
; MOVREL-NEXT: v_mov_b32_e32 v3, s5
4829-
; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
4830-
; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
4823+
; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0
4824+
; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
48314825
; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
48324826
; MOVREL-NEXT: s_waitcnt vmcnt(0)
48334827
; MOVREL-NEXT: v_mov_b32_e32 v0, v1

0 commit comments

Comments
 (0)