[AMDGPU] Select (xor i1 (divergent trunc:i32 x), -1) -> cmp_neq x, 1 #133698
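In rough terms, the new patterns fold a boolean inversion (xor with -1, i.e. true) of a divergent, single-use trunc-to-i1 into the compare that materializes the lane mask, so the separate s_xor disappears. A minimal hand-written sketch of the shape involved (illustrative only, not taken from the patch; the instruction sequences are paraphrased from the test updates below):

```llvm
; Hypothetical example: %x is an ordinary (divergent) VGPR argument whose
; low bit is tested and then inverted before the branch.
define void @sketch(i32 %x, ptr addrspace(1) %p) {
  %b = trunc i32 %x to i1
  %nb = xor i1 %b, true          ; single use of %b, inverted
  br i1 %nb, label %then, label %exit
then:
  store i32 0, ptr addrspace(1) %p
  br label %exit
exit:
  ret void
}

; Before this patch the condition lowered to (per the old test checks):
;   v_and_b32_e32    v0, 1, v0
;   v_cmp_eq_u32_e32 vcc, 1, v0
;   s_xor_b64        s[6:7], vcc, -1
; With the patch the inversion is folded into the compare:
;   v_and_b32_e32    v0, 1, v0
;   v_cmp_ne_u32_e32 vcc, 1, v0
```

The second new pattern handles an i64 source the same way, comparing the low 32 bits extracted with EXTRACT_SUBREG.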
Conversation
@llvm/pr-subscribers-backend-amdgpu
Author: Ana Mihajlovic (mihajlovicana)
Changes
Patch is 35.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133698.diff
12 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9051db0c01ed1..f770cf3014579 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3123,6 +3123,18 @@ def IMMBitSelConst : SDNodeXForm<imm, [{
// v_cmp_ne_u32_e64 $a, 0, $a
// Handle the VALU case.
+def : GCNPat <
+ (i1 (xor (i1 (DivergentUnaryFrag_oneuse<trunc> i32:$a)), -1)),
+ (V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 1), i32:$a),
+ (i32 1))
+>;
+
+def : GCNPat <
+ (i1 (xor (i1 (DivergentUnaryFrag_oneuse<trunc> i64:$a)), -1)),
+ (V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 1),
+ (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
+>;
+
def : GCNPat <
(i1 (DivergentUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))),
(V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 (IMMBitSelConst $b)), $a),
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index d6ad01c8f9b35..86e30096f5423 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1255,6 +1255,19 @@ class VOPPatGen<SDPatternOperator Op, VOPProfile P> {
list<dag> ret = [!con(Outs, (set Ins))];
}
+class DivergentUnaryFrag_oneuse<SDPatternOperator Op> : PatFrag <
+ (ops node:$src0),
+ (Op $src0),
+ [{ return N->isDivergent(); }]> {
+ // This check is unnecessary as it's captured by the result register
+ // bank constraint.
+ //
+ // FIXME: Should add a way for the emitter to recognize this is a
+ // trivially true predicate to eliminate the check.
+ let GISelPredicateCode = [{return true;}];
+ let HasOneUse = 1;
+}
+
class DivergentUnaryFrag<SDPatternOperator Op> : PatFrag <
(ops node:$src0),
(Op $src0),
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
index 61c0b8b861d5b..41082821bafe3 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
@@ -6,12 +6,11 @@ define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4
; GCN-NEXT: ; implicit-def: $vgpr5
; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB0_2
; GCN-NEXT: ; %bb.1: ; %F
; GCN-NEXT: s_mov_b32 s10, 0
@@ -101,11 +100,10 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4
; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB1_2
; GCN-NEXT: ; %bb.1: ; %F
; GCN-NEXT: s_mov_b32 s10, 0
@@ -172,11 +170,10 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4
; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB2_2
; GCN-NEXT: ; %bb.1: ; %F
; GCN-NEXT: s_mov_b32 s10, 0
@@ -249,11 +246,10 @@ define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4
; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB3_2
; GCN-NEXT: ; %bb.1: ; %F
; GCN-NEXT: s_mov_b32 s10, 0
@@ -353,11 +349,10 @@ define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4
; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB4_2
; GCN-NEXT: ; %bb.1: ; %F
; GCN-NEXT: s_mov_b32 s10, 0
@@ -424,11 +419,10 @@ define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4
; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB5_2
; GCN-NEXT: ; %bb.1: ; %F
; GCN-NEXT: s_mov_b32 s10, 0
@@ -501,11 +495,10 @@ define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
-; GCN-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4
; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
-; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: s_cbranch_execz .LBB6_2
; GCN-NEXT: ; %bb.1: ; %F
; GCN-NEXT: s_mov_b32 s10, 0
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll
index 13184cf17a2e5..fd64ea3ae1c4b 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll
@@ -6,8 +6,7 @@ define i32 @rocrand_regression(ptr addrspace(1) %arg, i32 %arg0, i1 %cmp7) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v0, 1, v3
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT: s_xor_b64 s[4:5], vcc, -1
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: .LBB0_1: ; %do.body
; CHECK-NEXT: ; =>This Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 8ee52a828de65..d0a3811314029 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -102,9 +102,8 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; CIGFX89: ; %bb.0: ; %bb
; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIGFX89-NEXT: v_and_b32_e32 v0, 1, v0
-; CIGFX89-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CIGFX89-NEXT: s_xor_b64 s[6:7], vcc, -1
-; CIGFX89-NEXT: s_and_saveexec_b64 s[4:5], s[6:7]
+; CIGFX89-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
+; CIGFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CIGFX89-NEXT: s_cbranch_execz .LBB3_2
; CIGFX89-NEXT: ; %bb.1: ; %bb1
; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
@@ -120,15 +119,14 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: s_xor_b32 s1, vcc_lo, -1
-; GFX11-NEXT: s_and_saveexec_b32 s0, s1
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmpx_ne_u32_e32 1, v0
; GFX11-NEXT: s_cbranch_execz .LBB3_2
; GFX11-NEXT: ; %bb.1: ; %bb1
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB3_2: ; %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 8dbd6c5d133ea..56ceba258f471 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -11,37 +11,47 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: v_writelane_b32 v5, s30, 0
; CHECK-NEXT: v_writelane_b32 v5, s31, 1
-; CHECK-NEXT: v_writelane_b32 v5, s34, 2
-; CHECK-NEXT: v_writelane_b32 v5, s35, 3
-; CHECK-NEXT: v_writelane_b32 v5, s36, 4
-; CHECK-NEXT: v_writelane_b32 v5, s37, 5
-; CHECK-NEXT: v_writelane_b32 v5, s38, 6
+; CHECK-NEXT: v_writelane_b32 v5, s36, 2
+; CHECK-NEXT: v_writelane_b32 v5, s37, 3
+; CHECK-NEXT: v_writelane_b32 v5, s38, 4
+; CHECK-NEXT: v_writelane_b32 v5, s39, 5
+; CHECK-NEXT: v_writelane_b32 v5, s48, 6
+; CHECK-NEXT: v_writelane_b32 v5, s49, 7
+; CHECK-NEXT: v_writelane_b32 v5, s50, 8
+; CHECK-NEXT: v_writelane_b32 v5, s51, 9
+; CHECK-NEXT: v_writelane_b32 v5, s52, 10
+; CHECK-NEXT: v_writelane_b32 v5, s53, 11
+; CHECK-NEXT: v_writelane_b32 v5, s54, 12
+; CHECK-NEXT: v_writelane_b32 v5, s55, 13
; CHECK-NEXT: s_getpc_b64 s[24:25]
-; CHECK-NEXT: v_writelane_b32 v5, s39, 7
-; CHECK-NEXT: s_movk_i32 s20, 0xf0
-; CHECK-NEXT: s_mov_b32 s21, s24
-; CHECK-NEXT: v_writelane_b32 v5, s48, 8
-; CHECK-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0
-; CHECK-NEXT: s_mov_b64 s[20:21], 0
-; CHECK-NEXT: v_writelane_b32 v5, s49, 9
-; CHECK-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0
-; CHECK-NEXT: v_writelane_b32 v5, s50, 10
+; CHECK-NEXT: v_writelane_b32 v5, s64, 14
+; CHECK-NEXT: s_movk_i32 s4, 0xf0
+; CHECK-NEXT: s_mov_b32 s5, s24
+; CHECK-NEXT: v_writelane_b32 v5, s65, 15
+; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: v_writelane_b32 v5, s66, 16
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
+; CHECK-NEXT: v_writelane_b32 v5, s67, 17
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_movk_i32 s22, 0x130
-; CHECK-NEXT: s_mov_b32 s23, s24
-; CHECK-NEXT: v_writelane_b32 v5, s51, 11
-; CHECK-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0
-; CHECK-NEXT: s_mov_b32 s28, 0
+; CHECK-NEXT: s_movk_i32 s6, 0x130
+; CHECK-NEXT: s_mov_b32 s7, s24
+; CHECK-NEXT: v_writelane_b32 v5, s68, 18
+; CHECK-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0
+; CHECK-NEXT: v_writelane_b32 v5, s69, 19
+; CHECK-NEXT: v_writelane_b32 v5, s70, 20
+; CHECK-NEXT: s_mov_b32 s68, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: v_mov_b32_e32 v2, s20
+; CHECK-NEXT: v_writelane_b32 v5, s71, 21
+; CHECK-NEXT: v_mov_b32_e32 v2, s4
; CHECK-NEXT: v_mov_b32_e32 v3, v1
-; CHECK-NEXT: s_mov_b32 s29, s28
-; CHECK-NEXT: s_mov_b32 s30, s28
-; CHECK-NEXT: s_mov_b32 s31, s28
-; CHECK-NEXT: image_sample_lz v3, v[2:3], s[12:19], s[28:31] dmask:0x1
+; CHECK-NEXT: s_mov_b32 s69, s68
+; CHECK-NEXT: s_mov_b32 s70, s68
+; CHECK-NEXT: s_mov_b32 s71, s68
+; CHECK-NEXT: image_sample_lz v3, v[2:3], s[16:23], s[68:71] dmask:0x1
; CHECK-NEXT: v_mov_b32_e32 v2, v1
; CHECK-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
-; CHECK-NEXT: v_writelane_b32 v5, s52, 12
+; CHECK-NEXT: s_mov_b32 s6, 48
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_writelane_b32 v6, s36, 0
; CHECK-NEXT: v_writelane_b32 v6, s37, 1
@@ -49,57 +59,44 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v6, s39, 3
; CHECK-NEXT: v_writelane_b32 v6, s40, 4
; CHECK-NEXT: v_writelane_b32 v6, s41, 5
-; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[28:31] dmask:0x1
+; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[68:71] dmask:0x1
; CHECK-NEXT: v_writelane_b32 v6, s42, 6
; CHECK-NEXT: v_writelane_b32 v6, s43, 7
; CHECK-NEXT: v_writelane_b32 v6, s44, 8
; CHECK-NEXT: v_writelane_b32 v6, s45, 9
-; CHECK-NEXT: v_writelane_b32 v5, s53, 13
; CHECK-NEXT: v_writelane_b32 v6, s46, 10
-; CHECK-NEXT: v_writelane_b32 v5, s54, 14
; CHECK-NEXT: v_writelane_b32 v6, s47, 11
-; CHECK-NEXT: v_writelane_b32 v5, s55, 15
; CHECK-NEXT: v_writelane_b32 v6, s48, 12
-; CHECK-NEXT: v_writelane_b32 v5, s64, 16
; CHECK-NEXT: v_writelane_b32 v6, s49, 13
-; CHECK-NEXT: v_writelane_b32 v5, s65, 17
; CHECK-NEXT: v_writelane_b32 v6, s50, 14
-; CHECK-NEXT: v_writelane_b32 v5, s66, 18
-; CHECK-NEXT: v_writelane_b32 v6, s51, 15
-; CHECK-NEXT: s_mov_b32 s40, 48
; CHECK-NEXT: s_movk_i32 s56, 0x1f0
-; CHECK-NEXT: s_movk_i32 s34, 0x2f0
-; CHECK-NEXT: s_mov_b32 s41, s24
+; CHECK-NEXT: s_movk_i32 s72, 0x2f0
; CHECK-NEXT: s_mov_b32 s57, s24
-; CHECK-NEXT: s_mov_b32 s35, s24
-; CHECK-NEXT: v_writelane_b32 v5, s67, 19
-; CHECK-NEXT: s_load_dwordx8 s[20:27], s[40:41], 0x0
-; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_mov_b32 s73, s24
+; CHECK-NEXT: v_writelane_b32 v6, s51, 15
+; CHECK-NEXT: s_load_dwordx8 s[24:31], s[6:7], 0x0
; CHECK-NEXT: s_load_dwordx16 s[36:51], s[56:57], 0x0
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: s_load_dwordx16 s[52:67], s[34:35], 0x0
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT: v_writelane_b32 v5, s68, 20
-; CHECK-NEXT: s_xor_b64 s[72:73], vcc, -1
-; CHECK-NEXT: v_writelane_b32 v5, s69, 21
+; CHECK-NEXT: s_load_dwordx16 s[52:67], s[72:73], 0x0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3
-; CHECK-NEXT: s_and_saveexec_b64 vcc, s[72:73]
-; CHECK-NEXT: s_xor_b64 s[34:35], exec, vcc
+; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; CHECK-NEXT: s_cbranch_execz .LBB0_3
; CHECK-NEXT: ; %bb.1: ; %bb48
-; CHECK-NEXT: image_sample_lz v3, v[1:2], s[12:19], s[28:31] dmask:0x1
+; CHECK-NEXT: image_sample_lz v3, v[1:2], s[16:23], s[68:71] dmask:0x1
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: s_and_b64 vcc, exec, -1
; CHECK-NEXT: .LBB0_2: ; %bb50
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_mov_b32 s29, s28
-; CHECK-NEXT: s_mov_b32 s30, s28
-; CHECK-NEXT: s_mov_b32 s31, s28
+; CHECK-NEXT: s_mov_b32 s69, s68
+; CHECK-NEXT: s_mov_b32 s70, s68
+; CHECK-NEXT: s_mov_b32 s71, s68
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: image_sample_lz v4, v[1:2], s[44:51], s[24:27] dmask:0x1
+; CHECK-NEXT: image_sample_lz v4, v[1:2], s[44:51], s[28:31] dmask:0x1
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[28:31] dmask:0x1
+; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_sub_f32_e32 v1, v1, v4
; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0
@@ -107,11 +104,11 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: s_mov_b64 vcc, vcc
; CHECK-NEXT: s_cbranch_vccnz .LBB0_2
; CHECK-NEXT: .LBB0_3: ; %Flow14
-; CHECK-NEXT: s_andn2_saveexec_b64 s[12:13], s[34:35]
+; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; CHECK-NEXT: s_cbranch_execz .LBB0_10
; CHECK-NEXT: ; %bb.4: ; %bb32
-; CHECK-NEXT: s_and_saveexec_b64 s[14:15], s[72:73]
-; CHECK-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
+; CHECK-NEXT: s_and_saveexec_b64 s[16:17], s[4:5]
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[16:17]
; CHECK-NEXT: s_cbranch_execz .LBB0_6
; CHECK-NEXT: ; %bb.5: ; %bb43
; CHECK-NEXT: s_mov_b32 s16, 0
@@ -120,12 +117,12 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_mov_b32_e32 v3, s17
; CHECK-NEXT: s_mov_b32 s18, s16
; CHECK-NEXT: s_mov_b32 s19, s16
-; CHECK-NEXT: image_sample_lz v1, v[2:3], s[4:11], s[16:19] dmask:0x1
+; CHECK-NEXT: image_sample_lz v1, v[2:3], s[8:15], s[16:19] dmask:0x1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_mov_b64 s[4:5], s[36:37]
-; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT: s_mov_b64 s[8:9], s[40:41]
-; CHECK-NEXT: s_mov_b64 s[10:11], s[42:43]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[12:13], s[40:41]
+; CHECK-NEXT: s_mov_b64 s[14:15], s[42:43]
; CHECK-NEXT: v_readlane_b32 s36, v6, 0
; CHECK-NEXT: v_readlane_b32 s44, v6, 8
; CHECK-NEXT: v_readlane_b32 s45, v6, 9
@@ -140,32 +137,32 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_readlane_b32 s39, v6, 3
; CHECK-NEXT: v_readlane_b32 s40, v6, 4
; CHECK-NEXT: v_readlane_b32 s41, v6, 5
-; CHECK-NEXT: image_sample_lz v0, v[2:3], s[44:51], s[20:23] dmask:0x1
+; CHECK-NEXT: image_sample_lz v0, v[2:3], s[44:51], s[24:27] dmask:0x1
; CHECK-NEXT: v_readlane_b32 s42, v6, 6
; CHECK-NEXT: v_readlane_b32 s43, v6, 7
; CHECK-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NEXT: s_mov_b64 s[42:43], s[10:11]
+; CHECK-NEXT: s_mov_b64 s[42:43], s[14:15]
; CHECK-NEXT: v_mov_b32_e32 v3, v2
-; CHECK-NEXT: s_mov_b64 s[40:41], s[8:9]
-; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT: s_mov_b64 s[36:37], s[4:5]
+; CHECK-NEXT: s_mov_b64 s[40:41], s[12:13]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[10:11]
+; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: buffer_store_dwordx3 v[1:3], off, s[16:19], 0
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: .LBB0_6: ; %Flow12
-; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[14:15]
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB0_9
; CHECK-NEXT: ; %bb.7: ; %bb33.preheader
; CHECK-NEXT: s_mov_b32 s8, 0
-; CHECK-NEXT: s_mov_b32 s6, s8
-; CHECK-NEXT: s_mov_b32 s7, s8
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
+; CHECK-NEXT: s_mov_b32 s12, s8
+; CHECK-NEXT: s_mov_b32 s13, s8
+; CHECK-NEXT: v_mov_b32_e32 v1, s12
; CHECK-NEXT: s_mov_b32 s9, s8
; CHECK-NEXT: s_mov_b32 s10, s8
; CHECK-NEXT: s_mov_b32 s11, s8
-; CHECK-NEXT: v_mov_b32_e32 ...
[truncated]
Shouldn't this be a DAG combine instead of an ISel pattern?
Because a divergent i1 trunc is selected into an and plus a cmp, which is target specific.
Ah, I didn't pay enough attention; I thought the pattern was selecting a cmp, but it's a trunc.
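For readers following along, a rough illustration of that point (assuming an ordinary divergent VGPR input; the exact compare emitted can vary):

```llvm
; Sketch only: a divergent trunc-to-i1 with no xor involved.
define i1 @trunc_only(i32 %x) {
  %b = trunc i32 %x to i1
  ret i1 %b
}
; Even this bare case is lowered by target-specific selection patterns
; (the existing DivergentUnaryFrag<trunc> GCNPats), producing a
; mask-and-compare pair along the lines of:
;   v_and_b32_e32  v0, 1, v0
;   v_cmp_eq_u32   vcc, 1, v0
; Because that and/compare lowering is target specific, folding the
; "xor ..., -1" into the compare condition (eq -> ne) is expressed as an
; ISel pattern next to it rather than as a generic DAG combine.
```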
; ISA-NEXT: s_branch .LBB0_1
; ISA-NEXT: %bb.5: ; %DummyReturnBlock
; ISA-NEXT: s_setpc_b64 s[30:31]
; ISA: ; %bb.0: ; %BB
It looks like this test wasn't autogenerated properly before; can you precommit an update to it?
Otherwise it's difficult to tell what changed.
done, changes should be visible now in the second commit
@@ -1267,6 +1267,19 @@ class DivergentUnaryFrag<SDPatternOperator Op> : PatFrag <
  let GISelPredicateCode = [{return true;}];
}

class DivergentUnaryFrag_oneuse<SDPatternOperator Op> : PatFrag < |
Can you write something like DivergentUnaryFrag<HasOneUseUnaryOp<trunc>> instead of defining this new class?
seems to be working
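For reference, the suggestion composes a one-use wrapper around trunc with the existing DivergentUnaryFrag, so the i32 pattern would look roughly like the sketch below (hypothetical: it assumes a HasOneUseUnaryOp-style PatFrag as named in the comment above, and may not match the exact form that landed):

```tablegen
// Sketch of the suggested composition. DivergentUnaryFrag and the output
// dag are taken from this patch; HasOneUseUnaryOp is assumed per the
// review comment.
def : GCNPat <
  (i1 (xor (i1 (DivergentUnaryFrag<HasOneUseUnaryOp<trunc>> i32:$a)), -1)),
  (V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 1), i32:$a), (i32 1))
>;
```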
ping
LGTM
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/51/builds/14224
Here is the relevant piece of the build log for reference.
patch is for amdgpu
No description provided.