Skip to content

Commit 5b59ae4

Browse files
authored
[DAG] Preserve NUW when reassociating (#87621)
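As in the generic reassociation case below, preserve the NUW (no-unsigned-wrap) flag when reassociating adds with constants; the flag is kept only when the inner operation is an ADD and both the inner and the outer node carry NUW.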
1 parent 8ebf7b7 commit 5b59ae4

File tree

7 files changed

+6077
-8294
lines changed

7 files changed

+6077
-8294
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1164,19 +1164,20 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
11641164
SDValue N01 = N0.getOperand(1);
11651165

11661166
if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N01))) {
1167+
SDNodeFlags NewFlags;
1168+
if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() &&
1169+
Flags.hasNoUnsignedWrap())
1170+
NewFlags.setNoUnsignedWrap(true);
1171+
11671172
if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) {
11681173
// Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
11691174
if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, {N01, N1}))
1170-
return DAG.getNode(Opc, DL, VT, N00, OpNode);
1175+
return DAG.getNode(Opc, DL, VT, N00, OpNode, NewFlags);
11711176
return SDValue();
11721177
}
11731178
if (TLI.isReassocProfitable(DAG, N0, N1)) {
11741179
// Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
11751180
// iff (op x, c1) has one use
1176-
SDNodeFlags NewFlags;
1177-
if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() &&
1178-
Flags.hasNoUnsignedWrap())
1179-
NewFlags.setNoUnsignedWrap(true);
11801181
SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1, NewFlags);
11811182
return DAG.getNode(Opc, DL, VT, OpNode, N01, NewFlags);
11821183
}

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 28 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -5678,22 +5678,18 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
56785678
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
56795679
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
56805680
; GFX11-NEXT: scratch_load_b32 v31, off, s32
5681-
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
5682-
; GFX11-NEXT: s_clause 0x4
5683-
; GFX11-NEXT: scratch_store_b128 off, v[18:21], s0 offset:64
5684-
; GFX11-NEXT: scratch_store_b128 off, v[10:13], s0 offset:32
5685-
; GFX11-NEXT: scratch_store_b128 off, v[6:9], s0 offset:16
5686-
; GFX11-NEXT: scratch_store_b128 off, v[2:5], s0
5687-
; GFX11-NEXT: scratch_store_b16 off, v1, s0 offset:128
5688-
; GFX11-NEXT: s_add_i32 s1, s0, 0x70
5689-
; GFX11-NEXT: s_add_i32 s2, s0, 0x60
5690-
; GFX11-NEXT: s_add_i32 s3, s0, 0x50
5691-
; GFX11-NEXT: s_add_i32 s0, s0, 48
5681+
; GFX11-NEXT: s_clause 0x5
5682+
; GFX11-NEXT: scratch_store_b128 v0, v[22:25], off offset:80
5683+
; GFX11-NEXT: scratch_store_b128 v0, v[18:21], off offset:64
5684+
; GFX11-NEXT: scratch_store_b128 v0, v[14:17], off offset:48
5685+
; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:32
5686+
; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:16
5687+
; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off
56925688
; GFX11-NEXT: s_waitcnt vmcnt(0)
5693-
; GFX11-NEXT: scratch_store_b128 off, v[30:33], s1
5694-
; GFX11-NEXT: scratch_store_b128 off, v[26:29], s2
5695-
; GFX11-NEXT: scratch_store_b128 off, v[22:25], s3
5696-
; GFX11-NEXT: scratch_store_b128 off, v[14:17], s0
5689+
; GFX11-NEXT: s_clause 0x2
5690+
; GFX11-NEXT: scratch_store_b128 v0, v[30:33], off offset:112
5691+
; GFX11-NEXT: scratch_store_b128 v0, v[26:29], off offset:96
5692+
; GFX11-NEXT: scratch_store_b16 v0, v1, off offset:128
56975693
; GFX11-NEXT: s_setpc_b64 s[30:31]
56985694
%ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0
56995695
%ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1
@@ -8827,19 +8823,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
88278823
; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54
88288824
; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58
88298825
; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62
8830-
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
8831-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
8832-
; GFX11-NEXT: s_add_i32 s1, s0, 0xf0
8833-
; GFX11-NEXT: s_add_i32 s2, s0, 0xe0
8834-
; GFX11-NEXT: s_add_i32 s3, s0, 0xd0
8835-
; GFX11-NEXT: s_add_i32 s4, s0, 0xc0
8836-
; GFX11-NEXT: s_add_i32 s5, s0, 0xb0
8837-
; GFX11-NEXT: s_add_i32 s6, s0, 0xa0
8838-
; GFX11-NEXT: s_add_i32 s7, s0, 0x90
8839-
; GFX11-NEXT: s_add_i32 s8, s0, 0x70
8840-
; GFX11-NEXT: s_add_i32 s9, s0, 0x60
8841-
; GFX11-NEXT: s_add_i32 s10, s0, 0x50
8842-
; GFX11-NEXT: s_add_i32 s11, s0, 48
88438826
; GFX11-NEXT: s_waitcnt vmcnt(31)
88448827
; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v3
88458828
; GFX11-NEXT: s_waitcnt vmcnt(30)
@@ -8936,23 +8919,23 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
89368919
; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
89378920
; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v2
89388921
; GFX11-NEXT: v_cvt_f64_f32_e32 v[1:2], v37
8939-
; GFX11-NEXT: scratch_store_b128 off, v[96:99], s1
8940-
; GFX11-NEXT: scratch_store_b128 off, v[84:87], s2
8941-
; GFX11-NEXT: scratch_store_b128 off, v[80:83], s3
8942-
; GFX11-NEXT: scratch_store_b128 off, v[68:71], s4
8943-
; GFX11-NEXT: scratch_store_b128 off, v[64:67], s5
8944-
; GFX11-NEXT: scratch_store_b128 off, v[52:55], s6
8945-
; GFX11-NEXT: scratch_store_b128 off, v[48:51], s7
8946-
; GFX11-NEXT: scratch_store_b128 off, v[33:36], s0 offset:128
8947-
; GFX11-NEXT: scratch_store_b128 off, v[29:32], s8
8948-
; GFX11-NEXT: scratch_store_b128 off, v[25:28], s9
8949-
; GFX11-NEXT: scratch_store_b128 off, v[21:24], s10
8950-
; GFX11-NEXT: scratch_store_b128 off, v[17:20], s0 offset:64
8951-
; GFX11-NEXT: scratch_store_b128 off, v[13:16], s11
8952-
; GFX11-NEXT: s_clause 0x2
8953-
; GFX11-NEXT: scratch_store_b128 off, v[9:12], s0 offset:32
8954-
; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:16
8955-
; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
8922+
; GFX11-NEXT: s_clause 0xf
8923+
; GFX11-NEXT: scratch_store_b128 v0, v[96:99], off offset:240
8924+
; GFX11-NEXT: scratch_store_b128 v0, v[84:87], off offset:224
8925+
; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off offset:208
8926+
; GFX11-NEXT: scratch_store_b128 v0, v[68:71], off offset:192
8927+
; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:176
8928+
; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160
8929+
; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144
8930+
; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128
8931+
; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
8932+
; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
8933+
; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
8934+
; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
8935+
; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
8936+
; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
8937+
; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
8938+
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
89568939
; GFX11-NEXT: s_setpc_b64 s[30:31]
89578940
%load = load <32 x bfloat>, ptr addrspace(1) %ptr
89588941
%fpext = fpext <32 x bfloat> %load to <32 x double>

llvm/test/CodeGen/AMDGPU/function-returns.ll

Lines changed: 33 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -1561,34 +1561,28 @@ define <33 x i32> @v33i32_func_void() #0 {
15611561
; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
15621562
; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
15631563
; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
1564-
; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
1565-
; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0
1566-
; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
1564+
; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
1565+
; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
1566+
; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0
15671567
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128
1568-
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
1569-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1570-
; GFX11-NEXT: s_add_i32 s1, s0, 0x70
1571-
; GFX11-NEXT: s_add_i32 s2, s0, 0x60
1572-
; GFX11-NEXT: s_add_i32 s3, s0, 0x50
1573-
; GFX11-NEXT: s_add_i32 s4, s0, 48
15741568
; GFX11-NEXT: s_waitcnt vmcnt(8)
1575-
; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
1569+
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
15761570
; GFX11-NEXT: s_waitcnt vmcnt(7)
1577-
; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
1571+
; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96
15781572
; GFX11-NEXT: s_waitcnt vmcnt(6)
1579-
; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
1573+
; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80
15801574
; GFX11-NEXT: s_waitcnt vmcnt(5)
1581-
; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64
1575+
; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64
15821576
; GFX11-NEXT: s_waitcnt vmcnt(4)
1583-
; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4
1577+
; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:48
15841578
; GFX11-NEXT: s_waitcnt vmcnt(3)
1585-
; GFX11-NEXT: scratch_store_b128 off, v[21:24], s0 offset:16
1579+
; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:32
15861580
; GFX11-NEXT: s_waitcnt vmcnt(2)
1587-
; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0
1581+
; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:16
15881582
; GFX11-NEXT: s_waitcnt vmcnt(1)
1589-
; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32
1583+
; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off
15901584
; GFX11-NEXT: s_waitcnt vmcnt(0)
1591-
; GFX11-NEXT: scratch_store_b32 off, v33, s0 offset:128
1585+
; GFX11-NEXT: scratch_store_b32 v0, v33, off offset:128
15921586
; GFX11-NEXT: s_setpc_b64 s[30:31]
15931587
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
15941588
%val = load <33 x i32>, ptr addrspace(1) %ptr
@@ -1850,34 +1844,28 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
18501844
; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
18511845
; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
18521846
; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
1853-
; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
1854-
; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0
1855-
; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
1847+
; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
1848+
; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
1849+
; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0
18561850
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128
1857-
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
1858-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1859-
; GFX11-NEXT: s_add_i32 s1, s0, 0x70
1860-
; GFX11-NEXT: s_add_i32 s2, s0, 0x60
1861-
; GFX11-NEXT: s_add_i32 s3, s0, 0x50
1862-
; GFX11-NEXT: s_add_i32 s4, s0, 48
18631851
; GFX11-NEXT: s_waitcnt vmcnt(8)
1864-
; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
1852+
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
18651853
; GFX11-NEXT: s_waitcnt vmcnt(7)
1866-
; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
1854+
; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96
18671855
; GFX11-NEXT: s_waitcnt vmcnt(6)
1868-
; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
1856+
; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80
18691857
; GFX11-NEXT: s_waitcnt vmcnt(5)
1870-
; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64
1858+
; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64
18711859
; GFX11-NEXT: s_waitcnt vmcnt(4)
1872-
; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4
1860+
; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:48
18731861
; GFX11-NEXT: s_waitcnt vmcnt(3)
1874-
; GFX11-NEXT: scratch_store_b128 off, v[21:24], s0 offset:16
1862+
; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:32
18751863
; GFX11-NEXT: s_waitcnt vmcnt(2)
1876-
; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0
1864+
; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:16
18771865
; GFX11-NEXT: s_waitcnt vmcnt(1)
1878-
; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32
1866+
; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off
18791867
; GFX11-NEXT: s_waitcnt vmcnt(0)
1880-
; GFX11-NEXT: scratch_store_b32 off, v33, s0 offset:128
1868+
; GFX11-NEXT: scratch_store_b32 v0, v33, off offset:128
18811869
; GFX11-NEXT: s_setpc_b64 s[30:31]
18821870
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
18831871
%val = load { <32 x i32>, i32 }, ptr addrspace(1) %ptr
@@ -2143,33 +2131,24 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
21432131
; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:144
21442132
; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:128
21452133
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0
2146-
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
2147-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2148-
; GFX11-NEXT: s_add_i32 s1, s0, 0xf0
2149-
; GFX11-NEXT: s_add_i32 s2, s0, 0xe0
2150-
; GFX11-NEXT: s_add_i32 s3, s0, 0xd0
2151-
; GFX11-NEXT: s_add_i32 s4, s0, 0xc0
2152-
; GFX11-NEXT: s_add_i32 s5, s0, 0xb0
2153-
; GFX11-NEXT: s_add_i32 s6, s0, 0xa0
2154-
; GFX11-NEXT: s_add_i32 s7, s0, 0x90
21552134
; GFX11-NEXT: s_waitcnt vmcnt(8)
2156-
; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
2135+
; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:240
21572136
; GFX11-NEXT: s_waitcnt vmcnt(7)
2158-
; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
2137+
; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:224
21592138
; GFX11-NEXT: s_waitcnt vmcnt(6)
2160-
; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
2139+
; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:208
21612140
; GFX11-NEXT: s_waitcnt vmcnt(5)
2162-
; GFX11-NEXT: scratch_store_b128 off, v[13:16], s4
2141+
; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:192
21632142
; GFX11-NEXT: s_waitcnt vmcnt(4)
2164-
; GFX11-NEXT: scratch_store_b128 off, v[17:20], s5
2143+
; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:176
21652144
; GFX11-NEXT: s_waitcnt vmcnt(3)
2166-
; GFX11-NEXT: scratch_store_b128 off, v[21:24], s6
2145+
; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:160
21672146
; GFX11-NEXT: s_waitcnt vmcnt(2)
2168-
; GFX11-NEXT: scratch_store_b128 off, v[25:28], s7
2147+
; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:144
21692148
; GFX11-NEXT: s_waitcnt vmcnt(1)
2170-
; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:128
2149+
; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:128
21712150
; GFX11-NEXT: s_waitcnt vmcnt(0)
2172-
; GFX11-NEXT: scratch_store_b32 off, v33, s0
2151+
; GFX11-NEXT: scratch_store_b32 v0, v33, off
21732152
; GFX11-NEXT: s_setpc_b64 s[30:31]
21742153
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
21752154
%val = load { i32, <32 x i32> }, ptr addrspace(1) %ptr

0 commit comments

Comments
 (0)