Skip to content

Commit d2edff8

Browse files
authored
[AMDGPU] PeepholeSDWA: Don't assume inst srcs are registers (#69576)
Fixing issue #66899 only required handling the V_LSHLREV_B16 case, but the same isReg() check is added for every instruction matched here, just in case. Fixes #66899
1 parent 2b97fe2 commit d2edff8

File tree

2 files changed

+115
-4
lines changed

2 files changed

+115
-4
lines changed

llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -546,7 +546,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
546546

547547
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
548548
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
549-
if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical())
549+
if (!Src1->isReg() || Src1->getReg().isPhysical() ||
550+
Dst->getReg().isPhysical())
550551
break;
551552

552553
if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
@@ -584,7 +585,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
584585
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
585586
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
586587

587-
if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical())
588+
if (!Src1->isReg() || Src1->getReg().isPhysical() ||
589+
Dst->getReg().isPhysical())
588590
break;
589591

590592
if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
@@ -647,7 +649,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
647649
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
648650
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
649651

650-
if (Src0->getReg().isPhysical() || Dst->getReg().isPhysical())
652+
if (!Src0->isReg() || Src0->getReg().isPhysical() ||
653+
Dst->getReg().isPhysical())
651654
break;
652655

653656
return std::make_unique<SDWASrcOperand>(
@@ -675,7 +678,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
675678

676679
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
677680

678-
if (ValSrc->getReg().isPhysical() || Dst->getReg().isPhysical())
681+
if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
682+
Dst->getReg().isPhysical())
679683
break;
680684

681685
return std::make_unique<SDWASrcOperand>(

llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2098,6 +2098,113 @@ bb11: ; preds = %bb10, %bb2
20982098
br label %bb1
20992099
}
21002100

2101+
define void @crash_lshlrevb16_not_reg_op() {
2102+
; NOSDWA-LABEL: crash_lshlrevb16_not_reg_op:
2103+
; NOSDWA: ; %bb.0: ; %bb0
2104+
; NOSDWA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2105+
; NOSDWA-NEXT: s_mov_b64 s[4:5], 0
2106+
; NOSDWA-NEXT: s_and_b64 vcc, exec, -1
2107+
; NOSDWA-NEXT: v_lshlrev_b16_e64 v3, 8, 1
2108+
; NOSDWA-NEXT: .LBB22_1: ; %bb1
2109+
; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1
2110+
; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
2111+
; NOSDWA-NEXT: v_mov_b32_e32 v2, 0xff
2112+
; NOSDWA-NEXT: s_lshl_b32 s6, s4, 3
2113+
; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
2114+
; NOSDWA-NEXT: s_mov_b64 s[4:5], 1
2115+
; NOSDWA-NEXT: v_and_b32_e32 v2, s4, v2
2116+
; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v3
2117+
; NOSDWA-NEXT: v_lshrrev_b16_e32 v2, s6, v2
2118+
; NOSDWA-NEXT: flat_store_byte v[0:1], v2
2119+
; NOSDWA-NEXT: s_mov_b64 vcc, vcc
2120+
; NOSDWA-NEXT: s_cbranch_vccnz .LBB22_1
2121+
; NOSDWA-NEXT: ; %bb.2: ; %DummyReturnBlock
2122+
; NOSDWA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2123+
; NOSDWA-NEXT: s_setpc_b64 s[30:31]
2124+
;
2125+
; GFX89-LABEL: crash_lshlrevb16_not_reg_op:
2126+
; GFX89: ; %bb.0: ; %bb0
2127+
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2128+
; GFX89-NEXT: s_mov_b64 s[4:5], 0
2129+
; GFX89-NEXT: s_and_b64 vcc, exec, -1
2130+
; GFX89-NEXT: v_lshlrev_b16_e64 v0, 8, 1
2131+
; GFX89-NEXT: .LBB22_1: ; %bb1
2132+
; GFX89-NEXT: ; =>This Inner Loop Header: Depth=1
2133+
; GFX89-NEXT: v_mov_b32_e32 v3, s4
2134+
; GFX89-NEXT: s_lshl_b32 s6, s4, 3
2135+
; GFX89-NEXT: v_mov_b32_e32 v1, s4
2136+
; GFX89-NEXT: v_or_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2137+
; GFX89-NEXT: v_mov_b32_e32 v2, s5
2138+
; GFX89-NEXT: s_mov_b64 s[4:5], 1
2139+
; GFX89-NEXT: v_lshrrev_b16_e32 v3, s6, v3
2140+
; GFX89-NEXT: flat_store_byte v[1:2], v3
2141+
; GFX89-NEXT: s_mov_b64 vcc, vcc
2142+
; GFX89-NEXT: s_cbranch_vccnz .LBB22_1
2143+
; GFX89-NEXT: ; %bb.2: ; %DummyReturnBlock
2144+
; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2145+
; GFX89-NEXT: s_setpc_b64 s[30:31]
2146+
;
2147+
; GFX9-LABEL: crash_lshlrevb16_not_reg_op:
2148+
; GFX9: ; %bb.0: ; %bb0
2149+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2150+
; GFX9-NEXT: s_mov_b64 s[4:5], 0
2151+
; GFX9-NEXT: v_lshlrev_b16_e64 v0, 8, 1
2152+
; GFX9-NEXT: s_and_b64 vcc, exec, -1
2153+
; GFX9-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2154+
; GFX9-NEXT: .LBB22_1: ; %bb1
2155+
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2156+
; GFX9-NEXT: s_lshl_b32 s6, s4, 3
2157+
; GFX9-NEXT: v_mov_b32_e32 v1, s4
2158+
; GFX9-NEXT: v_mov_b32_e32 v2, s5
2159+
; GFX9-NEXT: s_mov_b64 s[4:5], 1
2160+
; GFX9-NEXT: v_lshrrev_b16_e32 v3, s6, v0
2161+
; GFX9-NEXT: flat_store_byte v[1:2], v3
2162+
; GFX9-NEXT: s_mov_b64 vcc, vcc
2163+
; GFX9-NEXT: s_cbranch_vccnz .LBB22_1
2164+
; GFX9-NEXT: ; %bb.2: ; %DummyReturnBlock
2165+
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2166+
; GFX9-NEXT: s_setpc_b64 s[30:31]
2167+
;
2168+
; GFX10-LABEL: crash_lshlrevb16_not_reg_op:
2169+
; GFX10: ; %bb.0: ; %bb0
2170+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2171+
; GFX10-NEXT: v_lshlrev_b16 v0, 8, 1
2172+
; GFX10-NEXT: s_mov_b32 vcc_lo, exec_lo
2173+
; GFX10-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2174+
; GFX10-NEXT: s_mov_b64 s[4:5], 0
2175+
; GFX10-NEXT: .LBB22_1: ; %bb1
2176+
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
2177+
; GFX10-NEXT: s_lshl_b32 s6, s4, 3
2178+
; GFX10-NEXT: v_mov_b32_e32 v1, s4
2179+
; GFX10-NEXT: v_mov_b32_e32 v2, s5
2180+
; GFX10-NEXT: v_lshrrev_b16 v3, s6, v0
2181+
; GFX10-NEXT: s_mov_b64 s[4:5], 1
2182+
; GFX10-NEXT: flat_store_byte v[1:2], v3
2183+
; GFX10-NEXT: s_cbranch_vccnz .LBB22_1
2184+
; GFX10-NEXT: ; %bb.2: ; %DummyReturnBlock
2185+
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2186+
; GFX10-NEXT: s_setpc_b64 s[30:31]
2187+
%1 = alloca [2 x i8], align 1, addrspace(5)
2188+
%2 = getelementptr [2 x i8], ptr addrspace(5) %1, i32 0, i32 1
2189+
br label %bb0
2190+
2191+
bb0:
2192+
store i8 1, ptr addrspace(5) %2, align 1
2193+
br label %bb1
2194+
2195+
bb1:
2196+
%3 = phi i64 [ 1, %bb1 ], [ 0, %bb0 ]
2197+
%4 = trunc i64 %3 to i32
2198+
%5 = getelementptr i8, ptr addrspace(5) %1, i32 %4
2199+
%6 = load i8, ptr addrspace(5) %5, align 1
2200+
%7 = getelementptr i8, ptr null, i64 %3
2201+
store i8 %6, ptr %7, align 1
2202+
br i1 false, label %bb2, label %bb1
2203+
2204+
bb2:
2205+
br label %bb0
2206+
}
2207+
21012208
declare i32 @llvm.amdgcn.workitem.id.x()
21022209

21032210
attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }

0 commit comments

Comments
 (0)