Skip to content

Commit 3828ea6

Browse files
committed
[AMDGPU] Divergence-driven instruction selection for mul i32
Differential Revision: https://reviews.llvm.org/D109881
1 parent 636fc0e commit 3828ea6

File tree

6 files changed

+38
-54
lines changed

6 files changed

+38
-54
lines changed

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -622,9 +622,8 @@ def S_BFM_B32 : SOP2_32 <"s_bfm_b32",
622622
[(set i32:$sdst, (UniformBinFrag<AMDGPUbfm> i32:$src0, i32:$src1))]>;
623623
def S_BFM_B64 : SOP2_64_32_32 <"s_bfm_b64">;
624624

625-
// TODO: S_MUL_I32 require V_MUL_LO_I32 from VOP3 change
626625
def S_MUL_I32 : SOP2_32 <"s_mul_i32",
627-
[(set i32:$sdst, (mul i32:$src0, i32:$src1))]> {
626+
[(set i32:$sdst, (UniformBinFrag<mul> i32:$src0, i32:$src1))]> {
628627
let isCommutable = 1;
629628
}
630629
} // End isReMaterializable = 1

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -304,7 +304,7 @@ defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_l
304304
} // End SchedRW = [WriteDoubleAdd]
305305

306306
let SchedRW = [WriteIntMul] in {
307-
defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, mul>;
307+
defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile<VOP_I32_I32_I32>, DivergentBinFrag<mul>>;
308308
defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile<VOP_I32_I32_I32>, mulhu>;
309309
defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
310310
defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;

llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -76,22 +76,22 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
7676
; CHECK-NEXT: s_mov_b32 s5, 0x8311eb33
7777
; CHECK-NEXT: s_mov_b32 s6, 0x20140c
7878
; CHECK-NEXT: s_mov_b32 s7, 0xb6db6db7
79-
; CHECK-NEXT: s_mov_b32 s11, 0x49249249
80-
; CHECK-NEXT: s_mov_b32 s8, 0x24924924
81-
; CHECK-NEXT: s_mov_b32 s9, 0xaaaaaaab
82-
; CHECK-NEXT: s_mov_b32 s10, 0x2aaaaaaa
79+
; CHECK-NEXT: s_mov_b32 s8, 0x49249249
80+
; CHECK-NEXT: s_mov_b32 s9, 0x24924924
81+
; CHECK-NEXT: s_mov_b32 s10, 0xaaaaaaab
82+
; CHECK-NEXT: s_mov_b32 s11, 0x2aaaaaaa
8383
; CHECK-NEXT: v_and_b32_e32 v0, s4, v0
8484
; CHECK-NEXT: v_and_b32_e32 v1, s4, v1
8585
; CHECK-NEXT: v_and_b32_e32 v2, s4, v2
8686
; CHECK-NEXT: v_mul_lo_u32 v2, v2, s5
8787
; CHECK-NEXT: v_mul_lo_u32 v1, v1, s7
88-
; CHECK-NEXT: v_mul_lo_u32 v0, v0, s9
88+
; CHECK-NEXT: v_mul_lo_u32 v0, v0, s10
8989
; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xf9dc299a, v2
90-
; CHECK-NEXT: v_add_i32_e32 v1, vcc, s11, v1
90+
; CHECK-NEXT: v_add_i32_e32 v1, vcc, s8, v1
9191
; CHECK-NEXT: v_alignbit_b32 v0, v0, v0, 1
92-
; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0
92+
; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s11, v0
9393
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
94-
; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s8, v1
94+
; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s9, v1
9595
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
9696
; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s6, v2
9797
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc

llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -119,9 +119,9 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
119119
; SI: S_BRANCH %bb.4
120120
; SI: bb.2.Flow:
121121
; SI: successors: %bb.3(0x40000000), %bb.5(0x40000000)
122-
; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %34:vgpr_32, %bb.1, %10, %bb.4
123-
; SI: [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %35:vgpr_32, %bb.1, %9, %bb.4
124-
; SI: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %38:vgpr_32, %bb.4
122+
; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %10, %bb.4
123+
; SI: [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %33:vgpr_32, %bb.1, %9, %bb.4
124+
; SI: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %35:vgpr_32, %bb.4
125125
; SI: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
126126
; SI: S_BRANCH %bb.3
127127
; SI: bb.3.if:
@@ -133,7 +133,6 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
133133
; SI: successors: %bb.2(0x80000000)
134134
; SI: %9:vgpr_32 = nofpexcept V_MUL_F32_e32 [[COPY2]], [[PHI1]], implicit $mode, implicit $exec
135135
; SI: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[PHI1]], 3, implicit $exec
136-
; SI: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[V_MUL_LO_U32_e64_]]
137136
; SI: S_BRANCH %bb.2
138137
; SI: bb.5.if.end:
139138
; SI: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
@@ -146,8 +145,8 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
146145
; SI: S_CBRANCH_SCC1 %bb.1, implicit killed $scc
147146
; SI: S_BRANCH %bb.6
148147
; SI: bb.6.for.end:
149-
; SI: %33:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI6]], killed [[PHI5]], implicit $mode, implicit $exec
150-
; SI: $vgpr0 = COPY killed %33
148+
; SI: %31:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI6]], killed [[PHI5]], implicit $mode, implicit $exec
149+
; SI: $vgpr0 = COPY killed %31
151150
; SI: SI_RETURN_TO_EPILOG killed $vgpr0
152151
entry:
153152
; %break = icmp sgt i32 %bound, 0

llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll

Lines changed: 19 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -322,8 +322,6 @@ define hidden amdgpu_gfx i32 @strict_wwm_called(i32 %a) noinline {
322322
; GFX9-O0: ; %bb.0:
323323
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
324324
; GFX9-O0-NEXT: v_add_u32_e64 v1, v0, v0
325-
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
326-
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
327325
; GFX9-O0-NEXT: v_mul_lo_u32 v0, v1, v0
328326
; GFX9-O0-NEXT: v_sub_u32_e64 v0, v0, v1
329327
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
@@ -350,42 +348,36 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg)
350348
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
351349
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
352350
; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11]
353-
; GFX9-O0-NEXT: v_writelane_b32 v3, s33, 7
351+
; GFX9-O0-NEXT: v_writelane_b32 v3, s33, 2
354352
; GFX9-O0-NEXT: s_mov_b32 s33, s32
355353
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400
356354
; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0
357355
; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1
358-
; GFX9-O0-NEXT: v_writelane_b32 v3, s8, 2
359-
; GFX9-O0-NEXT: s_mov_b32 s8, s4
360-
; GFX9-O0-NEXT: v_readlane_b32 s4, v3, 2
361-
; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
362-
; GFX9-O0-NEXT: s_mov_b32 s9, s5
356+
; GFX9-O0-NEXT: s_mov_b32 s9, s8
357+
; GFX9-O0-NEXT: s_mov_b32 s8, s7
363358
; GFX9-O0-NEXT: s_mov_b32 s10, s6
364-
; GFX9-O0-NEXT: s_mov_b32 s11, s7
365-
; GFX9-O0-NEXT: v_writelane_b32 v3, s8, 3
366-
; GFX9-O0-NEXT: v_writelane_b32 v3, s9, 4
367-
; GFX9-O0-NEXT: v_writelane_b32 v3, s10, 5
368-
; GFX9-O0-NEXT: v_writelane_b32 v3, s11, 6
359+
; GFX9-O0-NEXT: s_mov_b32 s11, s5
360+
; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
361+
; GFX9-O0-NEXT: s_mov_b32 s5, s11
362+
; GFX9-O0-NEXT: s_mov_b32 s6, s10
363+
; GFX9-O0-NEXT: s_mov_b32 s7, s8
364+
; GFX9-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15 killed $sgpr4_sgpr5_sgpr6_sgpr7
369365
; GFX9-O0-NEXT: s_mov_b32 s8, 0
370-
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
366+
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s9
371367
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
372368
; GFX9-O0-NEXT: s_not_b64 exec, exec
373369
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
374370
; GFX9-O0-NEXT: s_not_b64 exec, exec
375371
; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1
376-
; GFX9-O0-NEXT: s_getpc_b64 s[4:5]
377-
; GFX9-O0-NEXT: s_add_u32 s4, s4, strict_wwm_called@rel32@lo+4
378-
; GFX9-O0-NEXT: s_addc_u32 s5, s5, strict_wwm_called@rel32@hi+12
379-
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[2:3]
380-
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[0:1]
381-
; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[12:13]
382-
; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[14:15]
372+
; GFX9-O0-NEXT: s_getpc_b64 s[12:13]
373+
; GFX9-O0-NEXT: s_add_u32 s12, s12, strict_wwm_called@rel32@lo+4
374+
; GFX9-O0-NEXT: s_addc_u32 s13, s13, strict_wwm_called@rel32@hi+12
375+
; GFX9-O0-NEXT: s_mov_b64 s[18:19], s[2:3]
376+
; GFX9-O0-NEXT: s_mov_b64 s[16:17], s[0:1]
377+
; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[16:17]
378+
; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[18:19]
383379
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
384-
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[4:5]
385-
; GFX9-O0-NEXT: v_readlane_b32 s4, v3, 3
386-
; GFX9-O0-NEXT: v_readlane_b32 s5, v3, 4
387-
; GFX9-O0-NEXT: v_readlane_b32 s6, v3, 5
388-
; GFX9-O0-NEXT: v_readlane_b32 s7, v3, 6
380+
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[12:13]
389381
; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0
390382
; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1
391383
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
@@ -394,7 +386,7 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg)
394386
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
395387
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[4:7], s8 offset:4
396388
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00
397-
; GFX9-O0-NEXT: v_readlane_b32 s33, v3, 7
389+
; GFX9-O0-NEXT: v_readlane_b32 s33, v3, 2
398390
; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1
399391
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
400392
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
@@ -467,15 +459,11 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
467459
; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
468460
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
469461
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
470-
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
471-
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
472462
; GFX9-O0-NEXT: v_mul_lo_u32 v2, v0, v1
473463
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
474464
; GFX9-O0-NEXT: v_mul_hi_u32 v1, v0, v6
475465
; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[4:5]
476466
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7
477-
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
478-
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
479467
; GFX9-O0-NEXT: v_mul_lo_u32 v3, v3, v6
480468
; GFX9-O0-NEXT: v_add3_u32 v1, v1, v2, v3
481469
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
@@ -485,8 +473,6 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
485473
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
486474
; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s4, v[1:2]
487475
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
488-
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
489-
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
490476
; GFX9-O0-NEXT: v_mul_lo_u32 v6, v0, v6
491477
; GFX9-O0-NEXT: s_mov_b32 s5, 0
492478
; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0

llvm/test/CodeGen/AMDGPU/wwm-reserved.ll

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -105,14 +105,14 @@ define hidden i32 @called(i32 %a) noinline {
105105
; GFX9-LABEL: {{^}}call:
106106
define amdgpu_kernel void @call(<4 x i32> inreg %tmp14, i32 inreg %arg) {
107107
; GFX9-DAG: s_load_dword [[ARG:s[0-9]+]]
108-
; GFX9-O0-DAG: s_mov_b32 s0, 0{{$}}
108+
; GFX9-O0-DAG: s_mov_b32 s4, 0{{$}}
109109
; GFX9-O0-DAG: v_mov_b32_e32 v0, [[ARG]]
110110
; GFX9-O0-DAG: v_mov_b32_e32 v2, v0
111111

112112
; GFX9-O3: v_mov_b32_e32 v2, [[ARG]]
113113

114114
; GFX9-NEXT: s_not_b64 exec, exec
115-
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0
115+
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
116116
; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
117117
; GFX9-NEXT: s_not_b64 exec, exec
118118
%tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
@@ -299,14 +299,14 @@ define hidden i32 @strict_wwm_called(i32 %a) noinline {
299299
; GFX9-LABEL: {{^}}strict_wwm_call:
300300
define amdgpu_kernel void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) {
301301
; GFX9-DAG: s_load_dword [[ARG:s[0-9]+]]
302-
; GFX9-O0-DAG: s_mov_b32 s0, 0{{$}}
302+
; GFX9-O0-DAG: s_mov_b32 s4, 0{{$}}
303303
; GFX9-O0-DAG: v_mov_b32_e32 v0, [[ARG]]
304304
; GFX9-O0-DAG: v_mov_b32_e32 v2, v0
305305

306306
; GFX9-O3: v_mov_b32_e32 v2, [[ARG]]
307307

308308
; GFX9-NEXT: s_not_b64 exec, exec
309-
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0
309+
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
310310
; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
311311
; GFX9-NEXT: s_not_b64 exec, exec
312312
%tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)

0 commit comments

Comments
 (0)