-
Notifications
You must be signed in to change notification settings - Fork 14.3k
AMDGPU: Replace insertelement undef with poison in cases with manual updates #130898
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
arsenm
merged 1 commit into
main
from
users/arsenm/amdgpu/use-insertelement-poison-instead-of-undef-manual-fixups
Mar 12, 2025
Merged
AMDGPU: Replace insertelement undef with poison in cases with manual updates #130898
arsenm
merged 1 commit into
main
from
users/arsenm/amdgpu/use-insertelement-poison-instead-of-undef-manual-fixups
Mar 12, 2025
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesI had to manually intervene in a few tests. fcanonicalize.f16.ll is directly sensitive Full diff: https://github.com/llvm/llvm-project/pull/130898.diff 3 Files Affected:
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index e72f3d3ce993a..d48b75a666db7 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -238,7 +238,7 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_pk_max_f16 v0, v0, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %ins0 = insertelement <2 x half> undef, half %lo, i32 0
+ %ins0 = insertelement <2 x half> poison, half %lo, i32 0
%ins1 = insertelement <2 x half> %ins0, half %hi, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1)
ret <2 x half> %canonicalized
@@ -2581,7 +2581,7 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %vec = insertelement <2 x half> undef, half %val, i32 0
+ %vec = insertelement <2 x half> poison, half %val, i32 0
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
ret <2 x half> %canonicalized
}
@@ -2622,7 +2622,7 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %vec = insertelement <2 x half> undef, half %val, i32 1
+ %vec = insertelement <2 x half> poison, half %val, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
ret <2 x half> %canonicalized
}
@@ -2785,7 +2785,7 @@ define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 2.0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %vec0 = insertelement <2 x half> undef, half %val, i32 0
+ %vec0 = insertelement <2 x half> poison, half %val, i32 0
%vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
ret <2 x half> %canonicalized
@@ -2829,7 +2829,7 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, 2.0, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %vec0 = insertelement <2 x half> undef, half 2.0, i32 0
+ %vec0 = insertelement <2 x half> poison, half 2.0, i32 0
%vec1 = insertelement <2 x half> %vec0, half %val, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1)
ret <2 x half> %canonicalized
@@ -2925,7 +2925,7 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %vec = insertelement <4 x half> undef, half %val, i32 0
+ %vec = insertelement <4 x half> poison, half %val, i32 0
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec)
ret <4 x half> %canonicalized
}
@@ -2977,7 +2977,7 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_pk_max_f16 v0, v0, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %vec0 = insertelement <4 x half> undef, half %val0, i32 0
+ %vec0 = insertelement <4 x half> poison, half %val0, i32 0
%vec1 = insertelement <4 x half> %vec0, half %val1, i32 1
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1)
ret <4 x half> %canonicalized
@@ -3035,7 +3035,7 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 0
; GFX11-FAKE16-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
- %vec0 = insertelement <4 x half> undef, half %val0, i32 0
+ %vec0 = insertelement <4 x half> poison, half %val0, i32 0
%vec1 = insertelement <4 x half> %vec0, half %val1, i32 2
%vec2 = insertelement <4 x half> %vec1, half %val2, i32 3
%canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec2)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
index 2c3cb1e6a5e6e..a4a8a985df0bf 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -134,7 +134,7 @@ define amdgpu_vs void @promote_load_from_store_aggr_varoff(<4 x i32> %input) {
; CHECK-NEXT: [[FOO3_UNPACK2:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @block4, i64 8), align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i32> undef, i32 [[FOO3_UNPACK2]], i32 2
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i32> [[TMP1]], i32 [[FOO3_UNPACK2]]
-; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x i32> %input, i32 [[TMP2]], i64 3
+; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x i32> [[INPUT:%.*]], i32 [[TMP2]], i64 3
; CHECK-NEXT: store <4 x i32> [[FOO12]], ptr addrspace(1) @pv1, align 16
; CHECK-NEXT: ret void
;
@@ -344,7 +344,7 @@ define amdgpu_ps void @promote_double_aggr() #0 {
; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[FOO5_FCA_1_EXTRACT]], [[FOO5_FCA_1_EXTRACT]]
; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[FOO10]], [[FOO5_FCA_1_EXTRACT]]
; CHECK-NEXT: [[FOO17:%.*]] = fptrunc double [[FOO16]] to float
-; CHECK-NEXT: [[FOO18:%.*]] = insertelement <4 x float> undef, float [[FOO17]], i32 0
+; CHECK-NEXT: [[FOO18:%.*]] = insertelement <4 x float> poison, float [[FOO17]], i32 0
; CHECK-NEXT: [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1
; CHECK-NEXT: [[FOO20:%.*]] = insertelement <4 x float> [[FOO19]], float [[FOO17]], i32 2
; CHECK-NEXT: [[FOO21:%.*]] = insertelement <4 x float> [[FOO20]], float [[FOO17]], i32 3
@@ -370,7 +370,7 @@ define amdgpu_ps void @promote_double_aggr() #0 {
%foo15 = load double, ptr addrspace(5) %foo14
%foo16 = fadd double %foo13, %foo15
%foo17 = fptrunc double %foo16 to float
- %foo18 = insertelement <4 x float> undef, float %foo17, i32 0
+ %foo18 = insertelement <4 x float> poison, float %foo17, i32 0
%foo19 = insertelement <4 x float> %foo18, float %foo17, i32 1
%foo20 = insertelement <4 x float> %foo19, float %foo17, i32 2
%foo21 = insertelement <4 x float> %foo20, float %foo17, i32 3
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
index 1e49500a243e1..119d3611e1007 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
@@ -9,15 +9,15 @@ define amdgpu_kernel void @test_overwrite(i64 %val, i1 %cond) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA]], i64 68, i32 0
+; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 68, i32 0
; CHECK-NEXT: [[TMP2]] = insertelement <3 x i64> [[TMP1]], i64 32, i32 0
; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 68
; CHECK-NEXT: br i1 [[LOOP_CC]], label [[LOOP]], label [[END]]
; CHECK: end:
-; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP2]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
+; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP2]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
; CHECK-NEXT: ret void
;
entry:
@@ -64,15 +64,15 @@ define amdgpu_kernel void @test_no_overwrite(i64 %val, i1 %cond) {
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP1:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
-; CHECK-NEXT: [[TMP1]] = insertelement <3 x i64> [[PROMOTEALLOCA]], i64 32, i32 1
+; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP1:%.*]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
+; CHECK-NEXT: [[TMP1]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 32, i32 1
; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 32
; CHECK-NEXT: br i1 [[LOOP_CC]], label [[LOOP]], label [[END]]
; CHECK: end:
-; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP1]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 1
+; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP1]], [[LOOP]] ], [ <i64 43, i64 undef, i64 undef>, [[ENTRY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 1
; CHECK-NEXT: ret void
;
entry:
|
4515e4f
to
ddaf38a
Compare
3a30865
to
6609eb3
Compare
This was referenced Mar 12, 2025
ddaf38a
to
e511b58
Compare
6609eb3
to
bcc5ce8
Compare
e511b58
to
f027ed2
Compare
bcc5ce8
to
4d5a13c
Compare
f027ed2
to
e980e7c
Compare
4d5a13c
to
63cb586
Compare
shiltian
approved these changes
Mar 12, 2025
e980e7c
to
c293ae1
Compare
Base automatically changed from
users/arsenm/amdgpu/use-insertelement-poison-instead-of-undef
to
main
March 12, 2025 13:33
…updates I had to manually intervene in a few tests. fcanonicalize.f16.ll is directly sensitive to undef vs. poison.
63cb586
to
7b07d29
Compare
This was referenced Mar 13, 2025
frederik-h
pushed a commit
to frederik-h/llvm-project
that referenced
this pull request
Mar 18, 2025
…updates (llvm#130898) I had to manually intervene in a few tests. fcanonicalize.f16.ll is directly sensitive to undef vs. poison.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
I had to manually intervene in a few tests. fcanonicalize.f16.ll is directly sensitive
to undef vs. poison.