Skip to content

DAG: Avoid stack usage in bitcast operand promotion to legal vector #125637

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 35 additions & 1 deletion llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2204,9 +2204,43 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) {
}

/// Legalize a BITCAST node whose integer operand needs promotion.
///
/// When the operand's type action is TypePromoteInteger, the result type is a
/// vector, and the target is little endian, this tries to avoid a stack
/// round-trip: the promoted integer is bitcast to a wider vector with the same
/// element type as the result, and the result is extracted as the subvector
/// starting at element 0. That path is taken only if the wider vector type is
/// legal. In every other case the bitcast is lowered through
/// CreateStackStoreLoad.
SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) {
EVT OutVT = N->getValueType(0);
SDValue InOp = N->getOperand(0);
EVT InVT = InOp.getValueType();
// Type the operand's type is transformed to by the target lowering.
EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
SDLoc dl(N);

switch (getTypeAction(InVT)) {
case TargetLowering::TypePromoteInteger: {
// TODO: Handle big endian
if (OutVT.isVector() && DAG.getDataLayout().isLittleEndian()) {
EVT EltVT = OutVT.getVectorElementType();
TypeSize EltSize = EltVT.getSizeInBits();
TypeSize NInSize = NInVT.getSizeInBits();

// The transformed integer can only be viewed as a vector of EltVT if its
// size is a known whole multiple of the element size.
if (NInSize.hasKnownScalarFactor(EltSize)) {
// Element count of a vector spanning the full transformed integer width
// (NInVT), rather than the width of the original operand type (InVT).
unsigned NumEltsWithPadding = NInSize.getKnownScalarFactor(EltSize);
EVT WideVecVT =
EVT::getVectorVT(*DAG.getContext(), EltVT, NumEltsWithPadding);

// Only use the vector path when the wide vector type is legal.
if (isTypeLegal(WideVecVT)) {
SDValue Promoted = GetPromotedInteger(InOp);
Copy link
Collaborator

@topperc topperc Feb 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to shift the scalar value left for big endian targets?

// Reinterpret the promoted integer as the wide vector, then take the
// OutVT-typed subvector that begins at element index 0.
SDValue Cast = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Promoted);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OutVT, Cast,
DAG.getVectorIdxConstant(0, dl));
}
}
}

// No vector shortcut applied; fall through to the stack-based lowering.
break;
}
default:
break;
}

// This should only occur in unusual situations like bitcasting to an
// x86_fp80, so just turn it into a store+load
// NOTE(review): this page shows the diff without +/- markers. The first
// return below appears to be the single deleted line and the second its
// replacement using the cached InOp/OutVT locals -- confirm against the
// merged file.
return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
return CreateStackStoreLoad(InOp, OutVT);
}

SDValue DAGTypeLegalizer::PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo) {
Expand Down
160 changes: 0 additions & 160 deletions llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,6 @@ define <5 x i32> @bitcast_i160_to_v5i32(i160 %int) {
; GFX9-LABEL: bitcast_i160_to_v5i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s33
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
; GFX9-NEXT: s_mov_b32 s5, s34
; GFX9-NEXT: s_mov_b32 s34, s32
; GFX9-NEXT: s_addk_i32 s32, 0x1000
; GFX9-NEXT: s_mov_b32 s32, s34
; GFX9-NEXT: s_mov_b32 s34, s5
; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: bitcast_i160_to_v5i32:
Expand All @@ -98,23 +89,6 @@ define <5 x i32> @bitcast_i160_to_v5i32(i160 %int) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, s33
; GFX12-NEXT: s_add_co_i32 s33, s32, 31
; GFX12-NEXT: s_mov_b32 s1, s34
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s33 offset:8
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s33
; GFX12-NEXT: scratch_load_b128 v[0:3], off, s33
; GFX12-NEXT: s_mov_b32 s34, s32
; GFX12-NEXT: s_add_co_i32 s32, s32, 64
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s32, s34
; GFX12-NEXT: s_mov_b32 s34, s1
; GFX12-NEXT: s_mov_b32 s33, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%bitcast = bitcast i160 %int to <5 x i32>
ret <5 x i32> %bitcast
Expand All @@ -124,15 +98,6 @@ define <6 x i32> @bitcast_i192_to_v6i32(i192 %int) {
; GFX9-LABEL: bitcast_i192_to_v6i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s33
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
; GFX9-NEXT: s_mov_b32 s5, s34
; GFX9-NEXT: s_mov_b32 s34, s32
; GFX9-NEXT: s_addk_i32 s32, 0x1000
; GFX9-NEXT: s_mov_b32 s32, s34
; GFX9-NEXT: s_mov_b32 s34, s5
; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: bitcast_i192_to_v6i32:
Expand All @@ -142,23 +107,6 @@ define <6 x i32> @bitcast_i192_to_v6i32(i192 %int) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, s33
; GFX12-NEXT: s_add_co_i32 s33, s32, 31
; GFX12-NEXT: s_mov_b32 s1, s34
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s33 offset:8
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s33
; GFX12-NEXT: scratch_load_b128 v[0:3], off, s33
; GFX12-NEXT: s_mov_b32 s34, s32
; GFX12-NEXT: s_add_co_i32 s32, s32, 64
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s32, s34
; GFX12-NEXT: s_mov_b32 s34, s1
; GFX12-NEXT: s_mov_b32 s33, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%bitcast = bitcast i192 %int to <6 x i32>
ret <6 x i32> %bitcast
Expand All @@ -168,15 +116,6 @@ define <7 x i32> @bitcast_i224_to_v7i32(i224 %int) {
; GFX9-LABEL: bitcast_i224_to_v7i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s33
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
; GFX9-NEXT: s_mov_b32 s5, s34
; GFX9-NEXT: s_mov_b32 s34, s32
; GFX9-NEXT: s_addk_i32 s32, 0x1000
; GFX9-NEXT: s_mov_b32 s32, s34
; GFX9-NEXT: s_mov_b32 s34, s5
; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: bitcast_i224_to_v7i32:
Expand All @@ -186,27 +125,6 @@ define <7 x i32> @bitcast_i224_to_v7i32(i224 %int) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, s33
; GFX12-NEXT: s_add_co_i32 s33, s32, 31
; GFX12-NEXT: s_mov_b32 s1, s34
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s33 offset:8
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s33
; GFX12-NEXT: scratch_load_b128 v[0:3], off, s33
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: scratch_store_b32 off, v6, s33 offset:24
; GFX12-NEXT: scratch_store_b64 off, v[4:5], s33 offset:16
; GFX12-NEXT: scratch_load_b96 v[4:6], off, s33 offset:16
; GFX12-NEXT: s_mov_b32 s34, s32
; GFX12-NEXT: s_add_co_i32 s32, s32, 64
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s32, s34
; GFX12-NEXT: s_mov_b32 s34, s1
; GFX12-NEXT: s_mov_b32 s33, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%bitcast = bitcast i224 %int to <7 x i32>
ret <7 x i32> %bitcast
Expand Down Expand Up @@ -252,15 +170,6 @@ define <3 x i64> @bitcast_i192_to_v3i64(i192 %int) {
; GFX9-LABEL: bitcast_i192_to_v3i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s33
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
; GFX9-NEXT: s_mov_b32 s5, s34
; GFX9-NEXT: s_mov_b32 s34, s32
; GFX9-NEXT: s_addk_i32 s32, 0x1000
; GFX9-NEXT: s_mov_b32 s32, s34
; GFX9-NEXT: s_mov_b32 s34, s5
; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: bitcast_i192_to_v3i64:
Expand All @@ -270,23 +179,6 @@ define <3 x i64> @bitcast_i192_to_v3i64(i192 %int) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, s33
; GFX12-NEXT: s_add_co_i32 s33, s32, 31
; GFX12-NEXT: s_mov_b32 s1, s34
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s33 offset:8
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s33
; GFX12-NEXT: scratch_load_b128 v[0:3], off, s33
; GFX12-NEXT: s_mov_b32 s34, s32
; GFX12-NEXT: s_add_co_i32 s32, s32, 64
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s32, s34
; GFX12-NEXT: s_mov_b32 s34, s1
; GFX12-NEXT: s_mov_b32 s33, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%bitcast = bitcast i192 %int to <3 x i64>
ret <3 x i64> %bitcast
Expand Down Expand Up @@ -408,15 +300,6 @@ define <5 x float> @bitcast_i160_to_v5f32(i160 %int) {
; GFX9-LABEL: bitcast_i160_to_v5f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s33
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
; GFX9-NEXT: s_mov_b32 s5, s34
; GFX9-NEXT: s_mov_b32 s34, s32
; GFX9-NEXT: s_addk_i32 s32, 0x1000
; GFX9-NEXT: s_mov_b32 s32, s34
; GFX9-NEXT: s_mov_b32 s34, s5
; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: bitcast_i160_to_v5f32:
Expand All @@ -426,23 +309,6 @@ define <5 x float> @bitcast_i160_to_v5f32(i160 %int) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, s33
; GFX12-NEXT: s_add_co_i32 s33, s32, 31
; GFX12-NEXT: s_mov_b32 s1, s34
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s33 offset:8
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s33
; GFX12-NEXT: scratch_load_b128 v[0:3], off, s33
; GFX12-NEXT: s_mov_b32 s34, s32
; GFX12-NEXT: s_add_co_i32 s32, s32, 64
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s32, s34
; GFX12-NEXT: s_mov_b32 s34, s1
; GFX12-NEXT: s_mov_b32 s33, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%bitcast = bitcast i160 %int to <5 x float>
ret <5 x float> %bitcast
Expand All @@ -452,15 +318,6 @@ define <6 x float> @bitcast_i192_to_v6f32(i192 %int) {
; GFX9-LABEL: bitcast_i192_to_v6f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s33
; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
; GFX9-NEXT: s_mov_b32 s5, s34
; GFX9-NEXT: s_mov_b32 s34, s32
; GFX9-NEXT: s_addk_i32 s32, 0x1000
; GFX9-NEXT: s_mov_b32 s32, s34
; GFX9-NEXT: s_mov_b32 s34, s5
; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: bitcast_i192_to_v6f32:
Expand All @@ -470,23 +327,6 @@ define <6 x float> @bitcast_i192_to_v6f32(i192 %int) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, s33
; GFX12-NEXT: s_add_co_i32 s33, s32, 31
; GFX12-NEXT: s_mov_b32 s1, s34
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: scratch_store_b64 off, v[2:3], s33 offset:8
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s33
; GFX12-NEXT: scratch_load_b128 v[0:3], off, s33
; GFX12-NEXT: s_mov_b32 s34, s32
; GFX12-NEXT: s_add_co_i32 s32, s32, 64
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s32, s34
; GFX12-NEXT: s_mov_b32 s34, s1
; GFX12-NEXT: s_mov_b32 s33, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%bitcast = bitcast i192 %int to <6 x float>
ret <6 x float> %bitcast
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3110,17 +3110,8 @@ define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_i160:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: s_mov_b32 s4, s33
; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0
; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800
; SDAG-NEXT: s_mov_b32 s5, s34
; SDAG-NEXT: s_mov_b32 s34, s32
; SDAG-NEXT: s_addk_i32 s32, 0x1000
; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
; SDAG-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16
; SDAG-NEXT: s_mov_b32 s32, s34
; SDAG-NEXT: s_mov_b32 s34, s5
; SDAG-NEXT: s_mov_b32 s33, s4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
Expand Down
Loading