Skip to content

[DAG] Preserve NUW when reassociating #87621

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1164,19 +1164,20 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
SDValue N01 = N0.getOperand(1);

if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N01))) {
+SDNodeFlags NewFlags;
+if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() &&
+Flags.hasNoUnsignedWrap())
+NewFlags.setNoUnsignedWrap(true);
+
if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) {
// Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, {N01, N1}))
-return DAG.getNode(Opc, DL, VT, N00, OpNode);
+return DAG.getNode(Opc, DL, VT, N00, OpNode, NewFlags);
return SDValue();
}
if (TLI.isReassocProfitable(DAG, N0, N1)) {
// Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
// iff (op x, c1) has one use
-SDNodeFlags NewFlags;
-if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() &&
-Flags.hasNoUnsignedWrap())
-NewFlags.setNoUnsignedWrap(true);
SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1, NewFlags);
return DAG.getNode(Opc, DL, VT, OpNode, N01, NewFlags);
}
Expand Down
73 changes: 28 additions & 45 deletions llvm/test/CodeGen/AMDGPU/bf16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5678,22 +5678,18 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_clause 0x4
-; GFX11-NEXT: scratch_store_b128 off, v[18:21], s0 offset:64
-; GFX11-NEXT: scratch_store_b128 off, v[10:13], s0 offset:32
-; GFX11-NEXT: scratch_store_b128 off, v[6:9], s0 offset:16
-; GFX11-NEXT: scratch_store_b128 off, v[2:5], s0
-; GFX11-NEXT: scratch_store_b16 off, v1, s0 offset:128
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: s_add_i32 s3, s0, 0x50
-; GFX11-NEXT: s_add_i32 s0, s0, 48
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: scratch_store_b128 v0, v[22:25], off offset:80
+; GFX11-NEXT: scratch_store_b128 v0, v[18:21], off offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[14:17], off offset:48
+; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b128 off, v[30:33], s1
-; GFX11-NEXT: scratch_store_b128 off, v[26:29], s2
-; GFX11-NEXT: scratch_store_b128 off, v[22:25], s3
-; GFX11-NEXT: scratch_store_b128 off, v[14:17], s0
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_store_b128 v0, v[30:33], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[26:29], off offset:96
+; GFX11-NEXT: scratch_store_b16 v0, v1, off offset:128
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0
%ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1
Expand Down Expand Up @@ -8827,19 +8823,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54
; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58
; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0xf0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s3, s0, 0xd0
-; GFX11-NEXT: s_add_i32 s4, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s5, s0, 0xb0
-; GFX11-NEXT: s_add_i32 s6, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s7, s0, 0x90
-; GFX11-NEXT: s_add_i32 s8, s0, 0x70
-; GFX11-NEXT: s_add_i32 s9, s0, 0x60
-; GFX11-NEXT: s_add_i32 s10, s0, 0x50
-; GFX11-NEXT: s_add_i32 s11, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(31)
; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v3
; GFX11-NEXT: s_waitcnt vmcnt(30)
Expand Down Expand Up @@ -8936,23 +8919,23 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v2
; GFX11-NEXT: v_cvt_f64_f32_e32 v[1:2], v37
-; GFX11-NEXT: scratch_store_b128 off, v[96:99], s1
-; GFX11-NEXT: scratch_store_b128 off, v[84:87], s2
-; GFX11-NEXT: scratch_store_b128 off, v[80:83], s3
-; GFX11-NEXT: scratch_store_b128 off, v[68:71], s4
-; GFX11-NEXT: scratch_store_b128 off, v[64:67], s5
-; GFX11-NEXT: scratch_store_b128 off, v[52:55], s6
-; GFX11-NEXT: scratch_store_b128 off, v[48:51], s7
-; GFX11-NEXT: scratch_store_b128 off, v[33:36], s0 offset:128
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s8
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s9
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s10
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s0 offset:64
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s11
-; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s0 offset:32
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:16
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_store_b128 v0, v[96:99], off offset:240
+; GFX11-NEXT: scratch_store_b128 v0, v[84:87], off offset:224
+; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off offset:208
+; GFX11-NEXT: scratch_store_b128 v0, v[68:71], off offset:192
+; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:176
+; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160
+; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144
+; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <32 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <32 x bfloat> %load to <32 x double>
Expand Down
87 changes: 33 additions & 54 deletions llvm/test/CodeGen/AMDGPU/function-returns.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1561,34 +1561,28 @@ define <33 x i32> @v33i32_func_void() #0 {
; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
-; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
-; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0
-; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
+; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: s_add_i32 s3, s0, 0x50
-; GFX11-NEXT: s_add_i32 s4, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:48
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s0 offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:32
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v33, s0 offset:128
+; GFX11-NEXT: scratch_store_b32 v0, v33, off offset:128
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
%val = load <33 x i32>, ptr addrspace(1) %ptr
Expand Down Expand Up @@ -1850,34 +1844,28 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
-; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
-; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0
-; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
+; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: s_add_i32 s3, s0, 0x50
-; GFX11-NEXT: s_add_i32 s4, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:48
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s0 offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:32
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v33, s0 offset:128
+; GFX11-NEXT: scratch_store_b32 v0, v33, off offset:128
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
%val = load { <32 x i32>, i32 }, ptr addrspace(1) %ptr
Expand Down Expand Up @@ -2143,33 +2131,24 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:144
; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:128
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0xf0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s3, s0, 0xd0
-; GFX11-NEXT: s_add_i32 s4, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s5, s0, 0xb0
-; GFX11-NEXT: s_add_i32 s6, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s7, s0, 0x90
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:240
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:224
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:208
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s4
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:192
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s5
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:176
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s6
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:160
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s7
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:144
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:128
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v33, s0
+; GFX11-NEXT: scratch_store_b32 v0, v33, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
%val = load { i32, <32 x i32> }, ptr addrspace(1) %ptr
Expand Down
Loading