Skip to content

Commit bc4fe8a

Browse files
committed
DAG: Avoid introducing stack usage when promoting the integer result of a vector->int bitcast
Avoids stack usage in the v5i32 to i160 case for AMDGPU, which appears in fat pointer lowering.
1 parent 4d7072f commit bc4fe8a

File tree

5 files changed

+355
-866
lines changed

5 files changed

+355
-866
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -566,6 +566,27 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
566566
}
567567
}
568568

569+
if (!NOutVT.isVector() && InOp.getValueType().isVector()) {
570+
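// Pad the vector operand with undef and cast to a wider integer.
// NOTE(review): inserting the input at element 0 keeps its bits in the low
// part of the promoted integer only on little-endian layouts; no endianness
// check is visible in this hunk - confirm big-endian targets are handled.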
571+
EVT EltVT = InOp.getValueType().getVectorElementType();
572+
TypeSize EltSize = EltVT.getSizeInBits();
573+
TypeSize OutSize = NOutVT.getSizeInBits();
574+
575+
if (OutSize.hasKnownScalarFactor(EltSize)) {
576+
unsigned NumEltsWithPadding = OutSize.getKnownScalarFactor(EltSize);
577+
EVT WideVecVT =
578+
EVT::getVectorVT(*DAG.getContext(), EltVT, NumEltsWithPadding);
579+
580+
if (isTypeLegal(WideVecVT)) {
581+
SDValue Inserted = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
582+
DAG.getUNDEF(WideVecVT), InOp,
583+
DAG.getVectorIdxConstant(0, dl));
584+
585+
return DAG.getNode(ISD::BITCAST, dl, NOutVT, Inserted);
586+
}
587+
}
588+
}
589+
569590
return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
570591
CreateStackStoreLoad(InOp, OutVT));
571592
}

llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll

Lines changed: 0 additions & 160 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,6 @@ define i160 @bitcast_v5i32_to_i160(<5 x i32> %vec) {
88
; GFX9-LABEL: bitcast_v5i32_to_i160:
99
; GFX9: ; %bb.0:
1010
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11-
; GFX9-NEXT: s_mov_b32 s4, s33
12-
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
13-
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
14-
; GFX9-NEXT: s_mov_b32 s5, s34
15-
; GFX9-NEXT: s_mov_b32 s34, s32
16-
; GFX9-NEXT: s_addk_i32 s32, 0x1000
17-
; GFX9-NEXT: s_mov_b32 s32, s34
18-
; GFX9-NEXT: s_mov_b32 s34, s5
19-
; GFX9-NEXT: s_mov_b32 s33, s4
2011
; GFX9-NEXT: s_setpc_b64 s[30:31]
2112
;
2213
; GFX12-LABEL: bitcast_v5i32_to_i160:
@@ -26,23 +17,6 @@ define i160 @bitcast_v5i32_to_i160(<5 x i32> %vec) {
2617
; GFX12-NEXT: s_wait_samplecnt 0x0
2718
; GFX12-NEXT: s_wait_bvhcnt 0x0
2819
; GFX12-NEXT: s_wait_kmcnt 0x0
29-
; GFX12-NEXT: s_mov_b32 s0, s33
30-
; GFX12-NEXT: s_add_co_i32 s33, s32, 31
31-
; GFX12-NEXT: s_mov_b32 s1, s34
32-
; GFX12-NEXT: s_wait_alu 0xfffe
33-
; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
34-
; GFX12-NEXT: s_mov_b32 s34, s32
35-
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s33
36-
; GFX12-NEXT: s_clause 0x1
37-
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s33
38-
; GFX12-NEXT: scratch_load_b64 v[2:3], off, s33 offset:8
39-
; GFX12-NEXT: s_add_co_i32 s32, s32, 64
40-
; GFX12-NEXT: s_wait_alu 0xfffe
41-
; GFX12-NEXT: s_mov_b32 s32, s34
42-
; GFX12-NEXT: s_mov_b32 s34, s1
43-
; GFX12-NEXT: s_mov_b32 s33, s0
44-
; GFX12-NEXT: s_wait_loadcnt 0x0
45-
; GFX12-NEXT: s_wait_alu 0xfffe
4620
; GFX12-NEXT: s_setpc_b64 s[30:31]
4721
%bitcast = bitcast <5 x i32> %vec to i160
4822
ret i160 %bitcast
@@ -52,15 +26,6 @@ define i192 @bitcast_v6i32_to_i192(<6 x i32> %vec) {
5226
; GFX9-LABEL: bitcast_v6i32_to_i192:
5327
; GFX9: ; %bb.0:
5428
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
55-
; GFX9-NEXT: s_mov_b32 s4, s33
56-
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
57-
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
58-
; GFX9-NEXT: s_mov_b32 s5, s34
59-
; GFX9-NEXT: s_mov_b32 s34, s32
60-
; GFX9-NEXT: s_addk_i32 s32, 0x1000
61-
; GFX9-NEXT: s_mov_b32 s32, s34
62-
; GFX9-NEXT: s_mov_b32 s34, s5
63-
; GFX9-NEXT: s_mov_b32 s33, s4
6429
; GFX9-NEXT: s_setpc_b64 s[30:31]
6530
;
6631
; GFX12-LABEL: bitcast_v6i32_to_i192:
@@ -70,23 +35,6 @@ define i192 @bitcast_v6i32_to_i192(<6 x i32> %vec) {
7035
; GFX12-NEXT: s_wait_samplecnt 0x0
7136
; GFX12-NEXT: s_wait_bvhcnt 0x0
7237
; GFX12-NEXT: s_wait_kmcnt 0x0
73-
; GFX12-NEXT: s_mov_b32 s0, s33
74-
; GFX12-NEXT: s_add_co_i32 s33, s32, 31
75-
; GFX12-NEXT: s_mov_b32 s1, s34
76-
; GFX12-NEXT: s_wait_alu 0xfffe
77-
; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
78-
; GFX12-NEXT: s_mov_b32 s34, s32
79-
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s33
80-
; GFX12-NEXT: s_clause 0x1
81-
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s33
82-
; GFX12-NEXT: scratch_load_b64 v[2:3], off, s33 offset:8
83-
; GFX12-NEXT: s_add_co_i32 s32, s32, 64
84-
; GFX12-NEXT: s_wait_alu 0xfffe
85-
; GFX12-NEXT: s_mov_b32 s32, s34
86-
; GFX12-NEXT: s_mov_b32 s34, s1
87-
; GFX12-NEXT: s_mov_b32 s33, s0
88-
; GFX12-NEXT: s_wait_loadcnt 0x0
89-
; GFX12-NEXT: s_wait_alu 0xfffe
9038
; GFX12-NEXT: s_setpc_b64 s[30:31]
9139
%bitcast = bitcast <6 x i32> %vec to i192
9240
ret i192 %bitcast
@@ -96,15 +44,6 @@ define i224 @bitcast_v7i32_to_i224(<7 x i32> %vec) {
9644
; GFX9-LABEL: bitcast_v7i32_to_i224:
9745
; GFX9: ; %bb.0:
9846
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
99-
; GFX9-NEXT: s_mov_b32 s4, s33
100-
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
101-
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
102-
; GFX9-NEXT: s_mov_b32 s5, s34
103-
; GFX9-NEXT: s_mov_b32 s34, s32
104-
; GFX9-NEXT: s_addk_i32 s32, 0x1000
105-
; GFX9-NEXT: s_mov_b32 s32, s34
106-
; GFX9-NEXT: s_mov_b32 s34, s5
107-
; GFX9-NEXT: s_mov_b32 s33, s4
10847
; GFX9-NEXT: s_setpc_b64 s[30:31]
10948
;
11049
; GFX12-LABEL: bitcast_v7i32_to_i224:
@@ -114,27 +53,6 @@ define i224 @bitcast_v7i32_to_i224(<7 x i32> %vec) {
11453
; GFX12-NEXT: s_wait_samplecnt 0x0
11554
; GFX12-NEXT: s_wait_bvhcnt 0x0
11655
; GFX12-NEXT: s_wait_kmcnt 0x0
117-
; GFX12-NEXT: s_mov_b32 s0, s33
118-
; GFX12-NEXT: s_add_co_i32 s33, s32, 31
119-
; GFX12-NEXT: s_mov_b32 s1, s34
120-
; GFX12-NEXT: s_wait_alu 0xfffe
121-
; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
122-
; GFX12-NEXT: s_mov_b32 s34, s32
123-
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s33
124-
; GFX12-NEXT: s_clause 0x1
125-
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s33
126-
; GFX12-NEXT: scratch_load_b64 v[2:3], off, s33 offset:8
127-
; GFX12-NEXT: scratch_store_b96 off, v[4:6], s33 offset:16
128-
; GFX12-NEXT: s_clause 0x1
129-
; GFX12-NEXT: scratch_load_b64 v[4:5], off, s33 offset:16
130-
; GFX12-NEXT: scratch_load_b32 v6, off, s33 offset:24
131-
; GFX12-NEXT: s_add_co_i32 s32, s32, 64
132-
; GFX12-NEXT: s_wait_alu 0xfffe
133-
; GFX12-NEXT: s_mov_b32 s32, s34
134-
; GFX12-NEXT: s_mov_b32 s34, s1
135-
; GFX12-NEXT: s_mov_b32 s33, s0
136-
; GFX12-NEXT: s_wait_loadcnt 0x0
137-
; GFX12-NEXT: s_wait_alu 0xfffe
13856
; GFX12-NEXT: s_setpc_b64 s[30:31]
13957
%bitcast = bitcast <7 x i32> %vec to i224
14058
ret i224 %bitcast
@@ -316,15 +234,6 @@ define i192 @bitcast_v3i64_to_i192(<3 x i64> %vec) {
316234
; GFX9-LABEL: bitcast_v3i64_to_i192:
317235
; GFX9: ; %bb.0:
318236
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
319-
; GFX9-NEXT: s_mov_b32 s4, s33
320-
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
321-
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
322-
; GFX9-NEXT: s_mov_b32 s5, s34
323-
; GFX9-NEXT: s_mov_b32 s34, s32
324-
; GFX9-NEXT: s_addk_i32 s32, 0x1000
325-
; GFX9-NEXT: s_mov_b32 s32, s34
326-
; GFX9-NEXT: s_mov_b32 s34, s5
327-
; GFX9-NEXT: s_mov_b32 s33, s4
328237
; GFX9-NEXT: s_setpc_b64 s[30:31]
329238
;
330239
; GFX12-LABEL: bitcast_v3i64_to_i192:
@@ -334,23 +243,6 @@ define i192 @bitcast_v3i64_to_i192(<3 x i64> %vec) {
334243
; GFX12-NEXT: s_wait_samplecnt 0x0
335244
; GFX12-NEXT: s_wait_bvhcnt 0x0
336245
; GFX12-NEXT: s_wait_kmcnt 0x0
337-
; GFX12-NEXT: s_mov_b32 s0, s33
338-
; GFX12-NEXT: s_add_co_i32 s33, s32, 31
339-
; GFX12-NEXT: s_mov_b32 s1, s34
340-
; GFX12-NEXT: s_wait_alu 0xfffe
341-
; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
342-
; GFX12-NEXT: s_mov_b32 s34, s32
343-
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s33
344-
; GFX12-NEXT: s_clause 0x1
345-
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s33
346-
; GFX12-NEXT: scratch_load_b64 v[2:3], off, s33 offset:8
347-
; GFX12-NEXT: s_add_co_i32 s32, s32, 64
348-
; GFX12-NEXT: s_wait_alu 0xfffe
349-
; GFX12-NEXT: s_mov_b32 s32, s34
350-
; GFX12-NEXT: s_mov_b32 s34, s1
351-
; GFX12-NEXT: s_mov_b32 s33, s0
352-
; GFX12-NEXT: s_wait_loadcnt 0x0
353-
; GFX12-NEXT: s_wait_alu 0xfffe
354246
; GFX12-NEXT: s_setpc_b64 s[30:31]
355247
%bitcast = bitcast <3 x i64> %vec to i192
356248
ret i192 %bitcast
@@ -498,15 +390,6 @@ define i160 @bitcast_v5f32_to_i160(<5 x float> %vec) {
498390
; GFX9-LABEL: bitcast_v5f32_to_i160:
499391
; GFX9: ; %bb.0:
500392
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
501-
; GFX9-NEXT: s_mov_b32 s4, s33
502-
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
503-
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
504-
; GFX9-NEXT: s_mov_b32 s5, s34
505-
; GFX9-NEXT: s_mov_b32 s34, s32
506-
; GFX9-NEXT: s_addk_i32 s32, 0x1000
507-
; GFX9-NEXT: s_mov_b32 s32, s34
508-
; GFX9-NEXT: s_mov_b32 s34, s5
509-
; GFX9-NEXT: s_mov_b32 s33, s4
510393
; GFX9-NEXT: s_setpc_b64 s[30:31]
511394
;
512395
; GFX12-LABEL: bitcast_v5f32_to_i160:
@@ -516,23 +399,6 @@ define i160 @bitcast_v5f32_to_i160(<5 x float> %vec) {
516399
; GFX12-NEXT: s_wait_samplecnt 0x0
517400
; GFX12-NEXT: s_wait_bvhcnt 0x0
518401
; GFX12-NEXT: s_wait_kmcnt 0x0
519-
; GFX12-NEXT: s_mov_b32 s0, s33
520-
; GFX12-NEXT: s_add_co_i32 s33, s32, 31
521-
; GFX12-NEXT: s_mov_b32 s1, s34
522-
; GFX12-NEXT: s_wait_alu 0xfffe
523-
; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
524-
; GFX12-NEXT: s_mov_b32 s34, s32
525-
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s33
526-
; GFX12-NEXT: s_clause 0x1
527-
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s33
528-
; GFX12-NEXT: scratch_load_b64 v[2:3], off, s33 offset:8
529-
; GFX12-NEXT: s_add_co_i32 s32, s32, 64
530-
; GFX12-NEXT: s_wait_alu 0xfffe
531-
; GFX12-NEXT: s_mov_b32 s32, s34
532-
; GFX12-NEXT: s_mov_b32 s34, s1
533-
; GFX12-NEXT: s_mov_b32 s33, s0
534-
; GFX12-NEXT: s_wait_loadcnt 0x0
535-
; GFX12-NEXT: s_wait_alu 0xfffe
536402
; GFX12-NEXT: s_setpc_b64 s[30:31]
537403
%bitcast = bitcast <5 x float> %vec to i160
538404
ret i160 %bitcast
@@ -630,15 +496,6 @@ define i192 @bitcast_v6f32_to_i192(<6 x float> %vec) {
630496
; GFX9-LABEL: bitcast_v6f32_to_i192:
631497
; GFX9: ; %bb.0:
632498
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
633-
; GFX9-NEXT: s_mov_b32 s4, s33
634-
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
635-
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
636-
; GFX9-NEXT: s_mov_b32 s5, s34
637-
; GFX9-NEXT: s_mov_b32 s34, s32
638-
; GFX9-NEXT: s_addk_i32 s32, 0x1000
639-
; GFX9-NEXT: s_mov_b32 s32, s34
640-
; GFX9-NEXT: s_mov_b32 s34, s5
641-
; GFX9-NEXT: s_mov_b32 s33, s4
642499
; GFX9-NEXT: s_setpc_b64 s[30:31]
643500
;
644501
; GFX12-LABEL: bitcast_v6f32_to_i192:
@@ -648,23 +505,6 @@ define i192 @bitcast_v6f32_to_i192(<6 x float> %vec) {
648505
; GFX12-NEXT: s_wait_samplecnt 0x0
649506
; GFX12-NEXT: s_wait_bvhcnt 0x0
650507
; GFX12-NEXT: s_wait_kmcnt 0x0
651-
; GFX12-NEXT: s_mov_b32 s0, s33
652-
; GFX12-NEXT: s_add_co_i32 s33, s32, 31
653-
; GFX12-NEXT: s_mov_b32 s1, s34
654-
; GFX12-NEXT: s_wait_alu 0xfffe
655-
; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
656-
; GFX12-NEXT: s_mov_b32 s34, s32
657-
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s33
658-
; GFX12-NEXT: s_clause 0x1
659-
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s33
660-
; GFX12-NEXT: scratch_load_b64 v[2:3], off, s33 offset:8
661-
; GFX12-NEXT: s_add_co_i32 s32, s32, 64
662-
; GFX12-NEXT: s_wait_alu 0xfffe
663-
; GFX12-NEXT: s_mov_b32 s32, s34
664-
; GFX12-NEXT: s_mov_b32 s34, s1
665-
; GFX12-NEXT: s_mov_b32 s33, s0
666-
; GFX12-NEXT: s_wait_loadcnt 0x0
667-
; GFX12-NEXT: s_wait_alu 0xfffe
668508
; GFX12-NEXT: s_setpc_b64 s[30:31]
669509
%bitcast = bitcast <6 x float> %vec to i192
670510
ret i192 %bitcast

llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3091,15 +3091,6 @@ define i160 @load_i160(ptr addrspace(8) inreg %buf) {
30913091
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30923092
; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0
30933093
; SDAG-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16
3094-
; SDAG-NEXT: s_mov_b32 s4, s33
3095-
; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0
3096-
; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800
3097-
; SDAG-NEXT: s_mov_b32 s5, s34
3098-
; SDAG-NEXT: s_mov_b32 s34, s32
3099-
; SDAG-NEXT: s_addk_i32 s32, 0x1800
3100-
; SDAG-NEXT: s_mov_b32 s32, s34
3101-
; SDAG-NEXT: s_mov_b32 s34, s5
3102-
; SDAG-NEXT: s_mov_b32 s33, s4
31033094
; SDAG-NEXT: s_waitcnt vmcnt(0)
31043095
; SDAG-NEXT: s_setpc_b64 s[30:31]
31053096
;

0 commit comments

Comments
 (0)