Skip to content

Commit 06c8c5b

Browse files
committed
Updating tests and removing superfluous check
1 parent 2404de9 commit 06c8c5b

File tree

8 files changed

+101
-50
lines changed

8 files changed

+101
-50
lines changed

llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

Lines changed: 2 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -45,15 +45,15 @@ class SDWADstOperand;
4545
using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
4646

4747
// helper typedef to make code cleaner
48-
typedef std::unordered_map<MachineInstr *, SDWAOperandsVector> SDWAOperandsMap;
48+
typedef MapVector<MachineInstr *, SDWAOperandsVector> SDWAOperandsMap;
4949

5050
class SIPeepholeSDWA : public MachineFunctionPass {
5151
private:
5252
MachineRegisterInfo *MRI;
5353
const SIRegisterInfo *TRI;
5454
const SIInstrInfo *TII;
5555

56-
std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
56+
MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
5757
SDWAOperandsMap PotentialMatches;
5858
SmallVector<MachineInstr *, 8> ConvertedInstructions;
5959

@@ -356,11 +356,6 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
356356
if (!isConvertibleToSDWA(*(UseMO.getParent()), ST, TII)) {
357357
return nullptr;
358358
}
359-
360-
// Not handling the obscure case where the same use is in multiple operands
361-
if (PotentialMatches->find(UseMO.getParent()) != PotentialMatches->end()) {
362-
return nullptr;
363-
}
364359
}
365360
// Now that it's guaranteed all uses are legal, iterate over the uses again
366361
// to add them for later conversion.

llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -298,9 +298,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
298298
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
299299
; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp
300300
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
301-
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
302-
; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
303-
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
301+
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
302+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
304303
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
305304
; GFX9-NEXT: s_setpc_b64 s[30:31]
306305
;
@@ -607,12 +606,10 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
607606
; GFX8-NEXT: v_add_u16_e32 v3, v3, v4
608607
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
609608
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
610-
; GFX8-NEXT: v_ashrrev_i16_e32 v2, 8, v2
611609
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
612610
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
613611
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
614-
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v2
615-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
612+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
616613
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
617614
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
618615
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1

llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -279,11 +279,9 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
279279
; GFX8-NEXT: v_max_i16_e32 v1, v1, v2
280280
; GFX8-NEXT: v_min_i16_e32 v1, v1, v4
281281
; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1
282-
; GFX8-NEXT: v_ashrrev_i16_e32 v1, 8, v1
283282
; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
284-
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
285283
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
286-
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
284+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
287285
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
288286
; GFX8-NEXT: s_setpc_b64 s[30:31]
289287
;
@@ -300,9 +298,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
300298
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
301299
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
302300
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
303-
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
304-
; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
305-
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
301+
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
302+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
306303
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
307304
; GFX9-NEXT: s_setpc_b64 s[30:31]
308305
;

llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -679,7 +679,8 @@ define amdgpu_kernel void @fmuladd_v2f16(
679679
; VI-FLUSH-NEXT: s_waitcnt vmcnt(1)
680680
; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v4, 16, v1
681681
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
682-
; VI-FLUSH-NEXT: v_mac_f16_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
682+
; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v2
683+
; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v5, v4
683684
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v3, 16, v3
684685
; VI-FLUSH-NEXT: v_mac_f16_e32 v0, v2, v1
685686
; VI-FLUSH-NEXT: v_or_b32_e32 v0, v0, v3

llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
1-
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
21
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=VI -check-prefix=GFX89 -check-prefix=GCN %s
32
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=GCN %s
43

@@ -469,8 +468,3 @@ body: |
469468
S_ENDPGM 0, implicit %7
470469
471470
...
472-
## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
473-
# GCN: {{.*}}
474-
# GFX89: {{.*}}
475-
# GFX9: {{.*}}
476-
# VI: {{.*}}

llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll

Lines changed: 88 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2204,6 +2204,94 @@ bb2:
22042204
br label %bb0
22052205
}
22062206

2207+
define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 {
2208+
; NOSDWA-LABEL: mac_v2half_same_srcop:
2209+
; NOSDWA: ; %bb.0: ; %entry
2210+
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2211+
; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2212+
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
2213+
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
2214+
; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
2215+
; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
2216+
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
2217+
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
2218+
; NOSDWA-NEXT: flat_load_dword v3, v[0:1]
2219+
; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
2220+
; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
2221+
; NOSDWA-NEXT: s_waitcnt vmcnt(1)
2222+
; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2
2223+
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
2224+
; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v3
2225+
; NOSDWA-NEXT: v_mac_f16_e32 v5, v4, v4
2226+
; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v5
2227+
; NOSDWA-NEXT: v_mac_f16_e32 v3, v2, v2
2228+
; NOSDWA-NEXT: v_or_b32_e32 v2, v3, v4
2229+
; NOSDWA-NEXT: flat_store_dword v[0:1], v2
2230+
; NOSDWA-NEXT: s_endpgm
2231+
;
2232+
; GFX89-LABEL: mac_v2half_same_srcop:
2233+
; GFX89: ; %bb.0: ; %entry
2234+
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2235+
; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2236+
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
2237+
; GFX89-NEXT: v_mov_b32_e32 v0, s6
2238+
; GFX89-NEXT: v_mov_b32_e32 v1, s7
2239+
; GFX89-NEXT: v_mov_b32_e32 v2, s0
2240+
; GFX89-NEXT: v_mov_b32_e32 v3, s1
2241+
; GFX89-NEXT: flat_load_dword v4, v[0:1]
2242+
; GFX89-NEXT: flat_load_dword v2, v[2:3]
2243+
; GFX89-NEXT: v_mov_b32_e32 v0, s4
2244+
; GFX89-NEXT: v_mov_b32_e32 v1, s5
2245+
; GFX89-NEXT: s_waitcnt vmcnt(1)
2246+
; GFX89-NEXT: v_lshrrev_b32_e32 v3, 16, v4
2247+
; GFX89-NEXT: s_waitcnt vmcnt(0)
2248+
; GFX89-NEXT: v_mac_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2249+
; GFX89-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2250+
; GFX89-NEXT: v_mac_f16_e32 v4, v2, v2
2251+
; GFX89-NEXT: v_or_b32_e32 v2, v4, v3
2252+
; GFX89-NEXT: flat_store_dword v[0:1], v2
2253+
; GFX89-NEXT: s_endpgm
2254+
;
2255+
; GFX9-LABEL: mac_v2half_same_srcop:
2256+
; GFX9: ; %bb.0: ; %entry
2257+
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2258+
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2259+
; GFX9-NEXT: v_mov_b32_e32 v0, 0
2260+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2261+
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2262+
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
2263+
; GFX9-NEXT: s_waitcnt vmcnt(1)
2264+
; GFX9-NEXT: v_pk_mul_f16 v1, v1, v1
2265+
; GFX9-NEXT: s_waitcnt vmcnt(0)
2266+
; GFX9-NEXT: v_pk_add_f16 v1, v1, v2
2267+
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
2268+
; GFX9-NEXT: s_endpgm
2269+
;
2270+
; GFX10-LABEL: mac_v2half_same_srcop:
2271+
; GFX10: ; %bb.0: ; %entry
2272+
; GFX10-NEXT: s_clause 0x1
2273+
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2274+
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2275+
; GFX10-NEXT: v_mov_b32_e32 v0, 0
2276+
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2277+
; GFX10-NEXT: s_clause 0x1
2278+
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
2279+
; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
2280+
; GFX10-NEXT: s_waitcnt vmcnt(1)
2281+
; GFX10-NEXT: v_pk_mul_f16 v1, v1, v1
2282+
; GFX10-NEXT: s_waitcnt vmcnt(0)
2283+
; GFX10-NEXT: v_pk_add_f16 v1, v1, v2
2284+
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
2285+
; GFX10-NEXT: s_endpgm
2286+
entry:
2287+
%a = load <2 x half>, ptr addrspace(1) %ina, align 4
2288+
%b = load <2 x half>, ptr addrspace(1) %inb, align 4
2289+
%mul = fmul <2 x half> %b, %b
2290+
%mac = fadd <2 x half> %mul, %a
2291+
store <2 x half> %mac, ptr addrspace(1) %out, align 4
2292+
ret void
2293+
}
2294+
22072295
declare i32 @llvm.amdgcn.workitem.id.x()
22082296

22092297
attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }

llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -32,12 +32,12 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0
3232
; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
3333
; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 op_sel_hi:[1,0]
3434

35+
; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
3536
; VI-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
36-
; VI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
3737
; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
38-
; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
39-
; VI-DAG: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
38+
; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, [[ZERO]], v{{[0-9]+}}
4039
; VI-DAG: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
40+
; VI-DAG: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
4141
; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}
4242
; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[TWO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4343
; VI-NOT: v_and_b32

update.bat

Lines changed: 0 additions & 21 deletions
This file was deleted.

0 commit comments

Comments
 (0)