-
Notifications
You must be signed in to change notification settings - Fork 14.3k
DAG: Avoid stack usage in bitcast operand promotion to legal vector #125637
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DAG: Avoid stack usage in bitcast operand promotion to legal vector #125637
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-selectiondag Author: Matt Arsenault (arsenm) Changes: Fix introducing stack usage if a bitcast source operand is an illegal integer type cast to a legal vector type. This should cover more situations, but this is the first one I noticed. Patch is 156.41 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125637.diff 12 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 95fb8b406e51bf..eb0c5faa7fe1eb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2202,9 +2202,42 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) {
}
SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) {
+ EVT OutVT = N->getValueType(0);
+ SDValue InOp = N->getOperand(0);
+ EVT InVT = InOp.getValueType();
+ EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+ SDLoc dl(N);
+
+ switch (getTypeAction(InVT)) {
+ case TargetLowering::TypePromoteInteger: {
+ if (OutVT.isVector()) {
+ EVT EltVT = OutVT.getVectorElementType();
+ TypeSize EltSize = EltVT.getSizeInBits();
+ TypeSize NInSize = NInVT.getSizeInBits();
+
+ if (NInSize.hasKnownScalarFactor(EltSize)) {
+ unsigned NumEltsWithPadding = NInSize.getKnownScalarFactor(EltSize);
+ EVT WideVecVT =
+ EVT::getVectorVT(*DAG.getContext(), EltVT, NumEltsWithPadding);
+
+ if (isTypeLegal(WideVecVT)) {
+ SDValue Promoted = GetPromotedInteger(InOp);
+ SDValue Cast = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Promoted);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OutVT, Cast,
+ DAG.getVectorIdxConstant(0, dl));
+ }
+ }
+ }
+
+ break;
+ }
+ default:
+ break;
+ }
+
// This should only occur in unusual situations like bitcasting to an
// x86_fp80, so just turn it into a store+load
- return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
+ return CreateStackStoreLoad(InOp, OutVT);
}
SDValue DAGTypeLegalizer::PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo) {
diff --git a/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll b/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll
index ab89bb293f6e6e..2c6aabec763306 100644
--- a/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll
@@ -80,15 +80,6 @@ define <5 x i32> @bitcast_i160_to_v5i32(i160 %int) {
; GFX9-LABEL: bitcast_i160_to_v5i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s33
-; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
-; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
-; GFX9-NEXT: s_mov_b32 s5, s34
-; GFX9-NEXT: s_mov_b32 s34, s32
-; GFX9-NEXT: s_addk_i32 s32, 0x1000
-; GFX9-NEXT: s_mov_b32 s32, s34
-; GFX9-NEXT: s_mov_b32 s34, s5
-; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: bitcast_i160_to_v5i32:
@@ -98,23 +89,6 @@ define <5 x i32> @bitcast_i160_to_v5i32(i160 %int) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s0, s33
-; GFX12-NEXT: s_add_co_i32 s33, s32, 31
-; GFX12-NEXT: s_mov_b32 s1, s34
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_store_b64 off, v[2:3], s33 offset:8
-; GFX12-NEXT: scratch_store_b64 off, v[0:1], s33
-; GFX12-NEXT: scratch_load_b128 v[0:3], off, s33
-; GFX12-NEXT: s_mov_b32 s34, s32
-; GFX12-NEXT: s_add_co_i32 s32, s32, 64
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 s32, s34
-; GFX12-NEXT: s_mov_b32 s34, s1
-; GFX12-NEXT: s_mov_b32 s33, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%bitcast = bitcast i160 %int to <5 x i32>
ret <5 x i32> %bitcast
@@ -124,15 +98,6 @@ define <6 x i32> @bitcast_i192_to_v6i32(i192 %int) {
; GFX9-LABEL: bitcast_i192_to_v6i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s33
-; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
-; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
-; GFX9-NEXT: s_mov_b32 s5, s34
-; GFX9-NEXT: s_mov_b32 s34, s32
-; GFX9-NEXT: s_addk_i32 s32, 0x1000
-; GFX9-NEXT: s_mov_b32 s32, s34
-; GFX9-NEXT: s_mov_b32 s34, s5
-; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: bitcast_i192_to_v6i32:
@@ -142,23 +107,6 @@ define <6 x i32> @bitcast_i192_to_v6i32(i192 %int) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s0, s33
-; GFX12-NEXT: s_add_co_i32 s33, s32, 31
-; GFX12-NEXT: s_mov_b32 s1, s34
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_store_b64 off, v[2:3], s33 offset:8
-; GFX12-NEXT: scratch_store_b64 off, v[0:1], s33
-; GFX12-NEXT: scratch_load_b128 v[0:3], off, s33
-; GFX12-NEXT: s_mov_b32 s34, s32
-; GFX12-NEXT: s_add_co_i32 s32, s32, 64
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 s32, s34
-; GFX12-NEXT: s_mov_b32 s34, s1
-; GFX12-NEXT: s_mov_b32 s33, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%bitcast = bitcast i192 %int to <6 x i32>
ret <6 x i32> %bitcast
@@ -168,15 +116,6 @@ define <7 x i32> @bitcast_i224_to_v7i32(i224 %int) {
; GFX9-LABEL: bitcast_i224_to_v7i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s33
-; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
-; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
-; GFX9-NEXT: s_mov_b32 s5, s34
-; GFX9-NEXT: s_mov_b32 s34, s32
-; GFX9-NEXT: s_addk_i32 s32, 0x1000
-; GFX9-NEXT: s_mov_b32 s32, s34
-; GFX9-NEXT: s_mov_b32 s34, s5
-; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: bitcast_i224_to_v7i32:
@@ -186,27 +125,6 @@ define <7 x i32> @bitcast_i224_to_v7i32(i224 %int) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s0, s33
-; GFX12-NEXT: s_add_co_i32 s33, s32, 31
-; GFX12-NEXT: s_mov_b32 s1, s34
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_store_b64 off, v[2:3], s33 offset:8
-; GFX12-NEXT: scratch_store_b64 off, v[0:1], s33
-; GFX12-NEXT: scratch_load_b128 v[0:3], off, s33
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_store_b32 off, v6, s33 offset:24
-; GFX12-NEXT: scratch_store_b64 off, v[4:5], s33 offset:16
-; GFX12-NEXT: scratch_load_b96 v[4:6], off, s33 offset:16
-; GFX12-NEXT: s_mov_b32 s34, s32
-; GFX12-NEXT: s_add_co_i32 s32, s32, 64
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 s32, s34
-; GFX12-NEXT: s_mov_b32 s34, s1
-; GFX12-NEXT: s_mov_b32 s33, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%bitcast = bitcast i224 %int to <7 x i32>
ret <7 x i32> %bitcast
@@ -252,15 +170,6 @@ define <3 x i64> @bitcast_i192_to_v3i64(i192 %int) {
; GFX9-LABEL: bitcast_i192_to_v3i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s33
-; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
-; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
-; GFX9-NEXT: s_mov_b32 s5, s34
-; GFX9-NEXT: s_mov_b32 s34, s32
-; GFX9-NEXT: s_addk_i32 s32, 0x1000
-; GFX9-NEXT: s_mov_b32 s32, s34
-; GFX9-NEXT: s_mov_b32 s34, s5
-; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: bitcast_i192_to_v3i64:
@@ -270,23 +179,6 @@ define <3 x i64> @bitcast_i192_to_v3i64(i192 %int) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s0, s33
-; GFX12-NEXT: s_add_co_i32 s33, s32, 31
-; GFX12-NEXT: s_mov_b32 s1, s34
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_store_b64 off, v[2:3], s33 offset:8
-; GFX12-NEXT: scratch_store_b64 off, v[0:1], s33
-; GFX12-NEXT: scratch_load_b128 v[0:3], off, s33
-; GFX12-NEXT: s_mov_b32 s34, s32
-; GFX12-NEXT: s_add_co_i32 s32, s32, 64
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 s32, s34
-; GFX12-NEXT: s_mov_b32 s34, s1
-; GFX12-NEXT: s_mov_b32 s33, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%bitcast = bitcast i192 %int to <3 x i64>
ret <3 x i64> %bitcast
@@ -408,15 +300,6 @@ define <5 x float> @bitcast_i160_to_v5f32(i160 %int) {
; GFX9-LABEL: bitcast_i160_to_v5f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s33
-; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
-; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
-; GFX9-NEXT: s_mov_b32 s5, s34
-; GFX9-NEXT: s_mov_b32 s34, s32
-; GFX9-NEXT: s_addk_i32 s32, 0x1000
-; GFX9-NEXT: s_mov_b32 s32, s34
-; GFX9-NEXT: s_mov_b32 s34, s5
-; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: bitcast_i160_to_v5f32:
@@ -426,23 +309,6 @@ define <5 x float> @bitcast_i160_to_v5f32(i160 %int) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s0, s33
-; GFX12-NEXT: s_add_co_i32 s33, s32, 31
-; GFX12-NEXT: s_mov_b32 s1, s34
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_store_b64 off, v[2:3], s33 offset:8
-; GFX12-NEXT: scratch_store_b64 off, v[0:1], s33
-; GFX12-NEXT: scratch_load_b128 v[0:3], off, s33
-; GFX12-NEXT: s_mov_b32 s34, s32
-; GFX12-NEXT: s_add_co_i32 s32, s32, 64
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 s32, s34
-; GFX12-NEXT: s_mov_b32 s34, s1
-; GFX12-NEXT: s_mov_b32 s33, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%bitcast = bitcast i160 %int to <5 x float>
ret <5 x float> %bitcast
@@ -452,15 +318,6 @@ define <6 x float> @bitcast_i192_to_v6f32(i192 %int) {
; GFX9-LABEL: bitcast_i192_to_v6f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s33
-; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0
-; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800
-; GFX9-NEXT: s_mov_b32 s5, s34
-; GFX9-NEXT: s_mov_b32 s34, s32
-; GFX9-NEXT: s_addk_i32 s32, 0x1000
-; GFX9-NEXT: s_mov_b32 s32, s34
-; GFX9-NEXT: s_mov_b32 s34, s5
-; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: bitcast_i192_to_v6f32:
@@ -470,23 +327,6 @@ define <6 x float> @bitcast_i192_to_v6f32(i192 %int) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s0, s33
-; GFX12-NEXT: s_add_co_i32 s33, s32, 31
-; GFX12-NEXT: s_mov_b32 s1, s34
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s33, s33, 31
-; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: scratch_store_b64 off, v[2:3], s33 offset:8
-; GFX12-NEXT: scratch_store_b64 off, v[0:1], s33
-; GFX12-NEXT: scratch_load_b128 v[0:3], off, s33
-; GFX12-NEXT: s_mov_b32 s34, s32
-; GFX12-NEXT: s_add_co_i32 s32, s32, 64
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_mov_b32 s32, s34
-; GFX12-NEXT: s_mov_b32 s34, s1
-; GFX12-NEXT: s_mov_b32 s33, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%bitcast = bitcast i192 %int to <6 x float>
ret <6 x float> %bitcast
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index 5f49e69a58ed87..405058b24dcc21 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -3110,17 +3110,8 @@ define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) {
; SDAG-LABEL: store_i160:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: s_mov_b32 s4, s33
-; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0
-; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800
-; SDAG-NEXT: s_mov_b32 s5, s34
-; SDAG-NEXT: s_mov_b32 s34, s32
-; SDAG-NEXT: s_addk_i32 s32, 0x1000
; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
; SDAG-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16
-; SDAG-NEXT: s_mov_b32 s32, s34
-; SDAG-NEXT: s_mov_b32 s34, s5
-; SDAG-NEXT: s_mov_b32 s33, s4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index 17ab8fc780fb41..6bf126af5ade23 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -457,58 +457,27 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr
;
; EG-LABEL: v_ctpop_v4i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 37, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T0.X, 1
+; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_64 T8.XY, T0.X, 0, #1
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: AND_INT * T0.W, T8.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: LSHR * T0.W, T8.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.X, T5.X,
-; EG-NEXT: AND_INT * T0.W, T8.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: LSHR * T0.W, T8.Y, literal.x,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: LSHR * T0.W, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
+; EG-NEXT: BCNT_INT T0.Y, PV.W,
+; EG-NEXT: AND_INT * T0.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: OR_INT * T8.Y, T1.W, PV.W,
+; EG-NEXT: BCNT_INT T0.X, PV.W,
+; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MOV T5.X, PV.Y,
-; EG-NEXT: MOV * T8.X, T4.X,
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid
%val = load <4 x i16>, ptr addrspace(1) %in.gep, align 16
@@ -601,94 +570,33 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr
;
; EG-LABEL: v_ctpop_v8i16:
; EG: ; %bb.0:
-; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 73, @12, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T12.X, 1
+; EG-NEXT: ALU 13, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_128 T12.XYZW, T0.X, 0, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
-; EG-NEXT: MOV T0.Y, T4.X,
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
-; EG-NEXT: ALU clause starting at 12:
-; EG-NEXT: LSHR * T0.W, T12.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT * T0.W, PV.W,
-; EG-NEXT: LSHL T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
-; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV * T4.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T12.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T4.X, PV.W,
-; EG-NEXT: MOV * T0.X, T5.X,
-; EG-NEXT: LSHR * T0.W, T12.Y, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T5.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T12.Y, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.Y, PS, PV.W,
-; EG-NEXT: MOV T5.X, PV.Y,
-; EG-NEXT: MOV * T0.X, T8.X,
-; EG-NEXT: LSHR * T0.W, T12.Z, literal.x,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: LSHR * T0.W, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T8.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T12.Z, literal.x,
+; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, PV.X, literal.x,
-; EG-NEXT: -65536(nan), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PS, PV.W,
-; EG-NEXT: MOV T8.X, PV.W,
-; EG-NEXT: MOV * T0.X, T9.X,
-; EG-NEXT: LSHR * T0.W, T12.W, literal.x,
+; EG-NEXT: BCNT_INT T0.Z, PS,
+; EG-NEXT: LSHR * T1.W, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
+; EG-NEXT: BCNT_INT T0.Y, PV.W,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
-; EG-NEXT: MOV * T9.X, PV.W,
-; EG-NEXT: MOV T0.X, PV.X,
-; EG-NEXT: AND_INT * T0.W, T12.W, literal.x,
-; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
-; EG-NEXT: BCNT_INT T0.W, PV.W,
-; EG-NEXT: ...
[truncated]
|
EVT::getVectorVT(*DAG.getContext(), EltVT, NumEltsWithPadding); | ||
|
||
if (isTypeLegal(WideVecVT)) { | ||
SDValue Promoted = GetPromotedInteger(InOp); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need to shift the scalar value left for big endian targets?
2d1418b
to
de94495
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Merge activity
|
5b18109
to
4b47549
Compare
de94495
to
35dbabd
Compare
Fix introducing stack usage if a bitcast source operand is an illegal integer type cast to a legal vector type. This should cover more situations, but this is the first one I noticed.
35dbabd
to
c47a4b3
Compare
DAG: Avoid stack usage in bitcast operand promotion to legal vector (llvm#125637) Fix introducing stack usage if a bitcast source operand is an illegal integer type cast to a legal vector type. This should cover more situations, but this is the first one I noticed.
Fix introducing stack usage if a bitcast source operand is an illegal
integer type cast to a legal vector type. This should cover more
situations, but this is the first one I noticed.