-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[GlobalISel] Combine G_MERGE_VALUES of x and undef #113616
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-globalisel Author: Thorsten Schütt (tschuett) Changes: into zext x ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [TRUNC], [DEF] Please continue padding merge values. // %bits_8_15:(s8) = G_IMPLICIT_DEF %bits_8_15 is defined by undef. Its value is undefined and we can pick an arbitrary value. For optimization, we pick zero. // %0:_(s16) = G_ZEXT %bits_0_7:(s8) The upper bits of %0 are zero and the lower bits come from %bits_0_7. Patch is 33.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113616.diff 11 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 9240a3c3127eb4..dbd9d6d553b24b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -922,6 +922,9 @@ class CombinerHelper {
bool matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI,
BuildFnTy &MatchInfo);
+ // merge_values(_, undef) -> zext
+ bool matchMergeXAndUndef(const MachineInstr &MI, BuildFnTy &MatchInfo);
+
private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index ead4149fc11068..6c84d6ad40471c 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -848,6 +848,14 @@ def unmerge_zext_to_zext : GICombineRule<
(apply [{ Helper.applyCombineUnmergeZExtToZExt(*${d}); }])
>;
+/// Transform merge_x_undef -> zext.
+def merge_of_x_and_undef : GICombineRule <
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_IMPLICIT_DEF $undef),
+ (G_MERGE_VALUES $root, $x, $undef):$MI,
+ [{ return Helper.matchMergeXAndUndef(*${MI}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${MI}, ${matchinfo}); }])>;
+
def merge_combines: GICombineGroup<[
unmerge_anyext_build_vector,
unmerge_merge,
@@ -855,7 +863,8 @@ def merge_combines: GICombineGroup<[
unmerge_cst,
unmerge_undef,
unmerge_dead_to_trunc,
- unmerge_zext_to_zext
+ unmerge_zext_to_zext,
+ merge_of_x_and_undef
]>;
// Under certain conditions, transform:
diff --git a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
index af1717dbf76f39..a45024d120be68 100644
--- a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
+++ b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_component_library(LLVMGlobalISel
GlobalISel.cpp
Combiner.cpp
CombinerHelper.cpp
+ CombinerHelperArtifacts.cpp
CombinerHelperCasts.cpp
CombinerHelperCompares.cpp
CombinerHelperVectorOps.cpp
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
new file mode 100644
index 00000000000000..29875b04c37984
--- /dev/null
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -0,0 +1,57 @@
+//===- CombinerHelperArtifacts.cpp-----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements CombinerHelper for legalization artifacts.
+//
+//===----------------------------------------------------------------------===//
+//
+// G_MERGE_VALUES
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/Support/Casting.h"
+
+#define DEBUG_TYPE "gi-combiner"
+
+using namespace llvm;
+
+bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ const GMerge *Merge = cast<GMerge>(&MI);
+
+ Register Dst = Merge->getReg(0);
+ Register Undef = Merge->getSourceReg(1);
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Merge->getSourceReg(0));
+
+ //
+ // %bits_8_15:_(s8) = G_IMPLICIT_DEF
+ // %0:_(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8)
+ //
+ // ->
+ //
+ // %0:_(s16) = G_ZEXT %bits_0_7:(s8)
+ //
+
+ if (!MRI.hasOneNonDBGUse(Undef) ||
+ !isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}}))
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildZExt(Dst, Merge->getSourceReg(0));
+ };
+ return true;
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
index 7566d38e6c6cfa..67cbdd19a05684 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
@@ -10,9 +10,9 @@ body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_merge
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: $w0 = COPY [[DEF]](s32)
- ; CHECK-NEXT: $w1 = COPY [[DEF1]](s32)
+ ; CHECK-NEXT: $w1 = COPY [[C]](s32)
%0:_(s32) = G_IMPLICIT_DEF
%1:_(s32) = G_IMPLICIT_DEF
%2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
@@ -115,9 +115,11 @@ body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_bitcast_merge
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: $w0 = COPY [[DEF]](s32)
- ; CHECK-NEXT: $w1 = COPY [[DEF1]](s32)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[DEF]](s32)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[ZEXT]](s64)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
+ ; CHECK-NEXT: $w0 = COPY [[UV]](s32)
+ ; CHECK-NEXT: $w1 = COPY [[UV1]](s32)
%0:_(s32) = G_IMPLICIT_DEF
%1:_(s32) = G_IMPLICIT_DEF
%2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
@@ -136,9 +138,8 @@ body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_merge_incompatible_types
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[DEF]](s32), [[DEF1]](s32)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[MV]](s64)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[DEF]](s32)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[ZEXT]](s64)
; CHECK-NEXT: $h0 = COPY [[UV]](s16)
; CHECK-NEXT: $h1 = COPY [[UV1]](s16)
; CHECK-NEXT: $h2 = COPY [[UV2]](s16)
@@ -539,3 +540,36 @@ body: |
$q0 = COPY %un1(s128)
$q1 = COPY %un2(s128)
...
+
+# Check that we zext the merge
+---
+name: test_merge_undef
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_merge_undef
+ ; CHECK: %opaque:_(s64) = COPY $x0
+ ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64)
+ ; CHECK-NEXT: $q0 = COPY %me(s128)
+ %opaque:_(s64) = COPY $x0
+ %def:_(s64) = G_IMPLICIT_DEF
+ %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+ $q0 = COPY %me(s128)
+...
+
+# Check that we don't zext the merge, multi-use
+---
+name: test_merge_undef_multi_use
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_merge_undef_multi_use
+ ; CHECK: %opaque:_(s64) = COPY $x0
+ ; CHECK-NEXT: %def:_(s64) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def(s64)
+ ; CHECK-NEXT: $q0 = COPY %me(s128)
+ ; CHECK-NEXT: $x0 = COPY %def(s64)
+ %opaque:_(s64) = COPY $x0
+ %def:_(s64) = G_IMPLICIT_DEF
+ %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+ $q0 = COPY %me(s128)
+ $x0 = COPY %def(s64)
+...
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 74e4a167ae14ca..afc1d932840ff7 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -45,25 +45,14 @@ define i64 @bswap_i16_to_i64_anyext(i16 %a) {
; The zext here is optimised to an any_extend during isel..
define i128 @bswap_i16_to_i128_anyext(i16 %a) {
-; CHECK-SD-LABEL: bswap_i16_to_i128_anyext:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov w8, w0
-; CHECK-SD-NEXT: mov x0, xzr
-; CHECK-SD-NEXT: rev w8, w8
-; CHECK-SD-NEXT: lsr w8, w8, #16
-; CHECK-SD-NEXT: lsl x1, x8, #48
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: bswap_i16_to_i128_anyext:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov w8, w0
-; CHECK-GI-NEXT: mov x0, xzr
-; CHECK-GI-NEXT: rev w8, w8
-; CHECK-GI-NEXT: lsr w8, w8, #16
-; CHECK-GI-NEXT: bfi x8, x8, #32, #32
-; CHECK-GI-NEXT: and x8, x8, #0xffff
-; CHECK-GI-NEXT: lsl x1, x8, #48
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: bswap_i16_to_i128_anyext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: lsl x1, x8, #48
+; CHECK-NEXT: ret
%3 = call i16 @llvm.bswap.i16(i16 %a)
%4 = zext i16 %3 to i128
%5 = shl i128 %4, 112
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 63f5464371cc62..e86282fa1883d9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1884,22 +1884,22 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) {
define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_ashr_i65_33:
; GCN: ; %bb.0:
-; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GCN-NEXT: s_ashr_i32 s2, s3, 1
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
+; GCN-NEXT: s_lshr_b32 s2, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT: s_ashr_i32 s2, s5, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_ashr_i65_33:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 1
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = ashr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 5dd4fa0809131f..2ae9d28cda16a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1574,8 +1574,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX6-LABEL: v_lshr_i65:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 1, v2
; GFX6-NEXT: v_mov_b32_e32 v5, 0
+; GFX6-NEXT: v_and_b32_e32 v4, 1, v2
; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 64, v3
; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3
@@ -1596,8 +1596,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX8-LABEL: v_lshr_i65:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
; GFX8-NEXT: v_mov_b32_e32 v5, 0
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1]
@@ -1618,8 +1618,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX9-LABEL: v_lshr_i65:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v2
; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_and_b32_e32 v4, 1, v2
; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3
; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3
; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1]
@@ -1688,8 +1688,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v2
; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: v_and_b32_e32 v0, 1, v2
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1700,8 +1700,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1712,8 +1712,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1749,20 +1749,22 @@ define i65 @v_lshr_i65_33(i65 %value) {
define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
; GCN-LABEL: s_lshr_i65:
; GCN: ; %bb.0:
-; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1
-; GCN-NEXT: s_sub_i32 s10, s3, 64
-; GCN-NEXT: s_sub_i32 s8, 64, s3
-; GCN-NEXT: s_cmp_lt_u32 s3, 64
+; GCN-NEXT: s_mov_b32 s4, s3
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
+; GCN-NEXT: s_sub_i32 s10, s4, 64
+; GCN-NEXT: s_sub_i32 s8, 64, s4
+; GCN-NEXT: s_cmp_lt_u32 s4, 64
; GCN-NEXT: s_cselect_b32 s11, 1, 0
-; GCN-NEXT: s_cmp_eq_u32 s3, 0
+; GCN-NEXT: s_cmp_eq_u32 s4, 0
; GCN-NEXT: s_cselect_b32 s12, 1, 0
-; GCN-NEXT: s_lshr_b64 s[6:7], s[4:5], s3
-; GCN-NEXT: s_lshr_b64 s[2:3], s[0:1], s3
-; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s8
-; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s10
+; GCN-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
+; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
+; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: s_cmp_lg_u32 s12, 0
; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GCN-NEXT: s_cmp_lg_u32 s11, 0
@@ -1771,24 +1773,26 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
;
; GFX10PLUS-LABEL: s_lshr_i65:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1
-; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64
-; GFX10PLUS-NEXT: s_sub_i32 s2, 64, s3
-; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64
+; GFX10PLUS-NEXT: s_mov_b32 s4, s3
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_sub_i32 s10, s4, 64
+; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
+; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s4
+; GFX10PLUS-NEXT: s_cmp_lt_u32 s4, 64
; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0
-; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u32 s4, 0
; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s3
-; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2
-; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[4:5], s3
+; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s4
+; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s5
+; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[2:3], s4
; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[4:5], s10
+; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s2, s4, 0
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, %amount
ret i65 %result
@@ -1797,22 +1801,22 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_lshr_i65_33:
; GCN: ; %bb.0:
-; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GCN-NEXT: s_lshr_b32 s2, s3, 1
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1
+; GCN-NEXT: s_lshr_b32 s2, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT: s_lshr_b32 s2, s5, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i65_33:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GFX10PLUS-NEXT: s_lshr_b32 s2, s3, 1
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s5, 1
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index bac80f0777c024..ac6660b76ded98 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -1440,6 +1440,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX6-LABEL: v_sext_inreg_i65_22:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v3, 0
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 22
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1455,6 +1456,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX8-LABEL: v_sext_inreg_i65_22:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1470,6 +1472,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX9-LABEL: v_sext_inreg_i65_22:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX9-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1484,6 +1487,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX10PLUS-LABEL: v_sext_inreg_i65_22:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_mov_b32_e32 v3, 0
; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 0, v[0:1]
@@ -1555,29 +1559,29 @@ define i65 @...
[truncated]
|
@llvm/pr-subscribers-backend-amdgpu Author: Thorsten Schütt (tschuett) Changes: into zext x ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [TRUNC], [DEF] Please continue padding merge values. // %bits_8_15:(s8) = G_IMPLICIT_DEF %bits_8_15 is defined by undef. Its value is undefined and we can pick an arbitrary value. For optimization, we pick zero. // %0:_(s16) = G_ZEXT %bits_0_7:(s8) The upper bits of %0 are zero and the lower bits come from %bits_0_7. Patch is 33.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113616.diff 11 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 9240a3c3127eb4..dbd9d6d553b24b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -922,6 +922,9 @@ class CombinerHelper {
bool matchUnmergeValuesAnyExtBuildVector(const MachineInstr &MI,
BuildFnTy &MatchInfo);
+ // merge_values(_, undef) -> zext
+ bool matchMergeXAndUndef(const MachineInstr &MI, BuildFnTy &MatchInfo);
+
private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index ead4149fc11068..6c84d6ad40471c 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -848,6 +848,14 @@ def unmerge_zext_to_zext : GICombineRule<
(apply [{ Helper.applyCombineUnmergeZExtToZExt(*${d}); }])
>;
+/// Transform merge_x_undef -> zext.
+def merge_of_x_and_undef : GICombineRule <
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_IMPLICIT_DEF $undef),
+ (G_MERGE_VALUES $root, $x, $undef):$MI,
+ [{ return Helper.matchMergeXAndUndef(*${MI}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${MI}, ${matchinfo}); }])>;
+
def merge_combines: GICombineGroup<[
unmerge_anyext_build_vector,
unmerge_merge,
@@ -855,7 +863,8 @@ def merge_combines: GICombineGroup<[
unmerge_cst,
unmerge_undef,
unmerge_dead_to_trunc,
- unmerge_zext_to_zext
+ unmerge_zext_to_zext,
+ merge_of_x_and_undef
]>;
// Under certain conditions, transform:
diff --git a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
index af1717dbf76f39..a45024d120be68 100644
--- a/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
+++ b/llvm/lib/CodeGen/GlobalISel/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_component_library(LLVMGlobalISel
GlobalISel.cpp
Combiner.cpp
CombinerHelper.cpp
+ CombinerHelperArtifacts.cpp
CombinerHelperCasts.cpp
CombinerHelperCompares.cpp
CombinerHelperVectorOps.cpp
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
new file mode 100644
index 00000000000000..29875b04c37984
--- /dev/null
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperArtifacts.cpp
@@ -0,0 +1,57 @@
+//===- CombinerHelperArtifacts.cpp-----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements CombinerHelper for legalization artifacts.
+//
+//===----------------------------------------------------------------------===//
+//
+// G_MERGE_VALUES
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/Support/Casting.h"
+
+#define DEBUG_TYPE "gi-combiner"
+
+using namespace llvm;
+
+bool CombinerHelper::matchMergeXAndUndef(const MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ const GMerge *Merge = cast<GMerge>(&MI);
+
+ Register Dst = Merge->getReg(0);
+ Register Undef = Merge->getSourceReg(1);
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Merge->getSourceReg(0));
+
+ //
+ // %bits_8_15:_(s8) = G_IMPLICIT_DEF
+ // %0:_(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8)
+ //
+ // ->
+ //
+ // %0:_(s16) = G_ZEXT %bits_0_7:(s8)
+ //
+
+ if (!MRI.hasOneNonDBGUse(Undef) ||
+ !isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}}))
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildZExt(Dst, Merge->getSourceReg(0));
+ };
+ return true;
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
index 7566d38e6c6cfa..67cbdd19a05684 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-unmerge.mir
@@ -10,9 +10,9 @@ body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_merge
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: $w0 = COPY [[DEF]](s32)
- ; CHECK-NEXT: $w1 = COPY [[DEF1]](s32)
+ ; CHECK-NEXT: $w1 = COPY [[C]](s32)
%0:_(s32) = G_IMPLICIT_DEF
%1:_(s32) = G_IMPLICIT_DEF
%2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
@@ -115,9 +115,11 @@ body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_bitcast_merge
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: $w0 = COPY [[DEF]](s32)
- ; CHECK-NEXT: $w1 = COPY [[DEF1]](s32)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[DEF]](s32)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[ZEXT]](s64)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
+ ; CHECK-NEXT: $w0 = COPY [[UV]](s32)
+ ; CHECK-NEXT: $w1 = COPY [[UV1]](s32)
%0:_(s32) = G_IMPLICIT_DEF
%1:_(s32) = G_IMPLICIT_DEF
%2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
@@ -136,9 +138,8 @@ body: |
bb.1:
; CHECK-LABEL: name: test_combine_unmerge_merge_incompatible_types
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[DEF]](s32), [[DEF1]](s32)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[MV]](s64)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[DEF]](s32)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[ZEXT]](s64)
; CHECK-NEXT: $h0 = COPY [[UV]](s16)
; CHECK-NEXT: $h1 = COPY [[UV1]](s16)
; CHECK-NEXT: $h2 = COPY [[UV2]](s16)
@@ -539,3 +540,36 @@ body: |
$q0 = COPY %un1(s128)
$q1 = COPY %un2(s128)
...
+
+# Check that we zext the merge
+---
+name: test_merge_undef
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_merge_undef
+ ; CHECK: %opaque:_(s64) = COPY $x0
+ ; CHECK-NEXT: %me:_(s128) = G_ZEXT %opaque(s64)
+ ; CHECK-NEXT: $q0 = COPY %me(s128)
+ %opaque:_(s64) = COPY $x0
+ %def:_(s64) = G_IMPLICIT_DEF
+ %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+ $q0 = COPY %me(s128)
+...
+
+# Check that we don't zext the merge, multi-use
+---
+name: test_merge_undef_multi_use
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_merge_undef_multi_use
+ ; CHECK: %opaque:_(s64) = COPY $x0
+ ; CHECK-NEXT: %def:_(s64) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def(s64)
+ ; CHECK-NEXT: $q0 = COPY %me(s128)
+ ; CHECK-NEXT: $x0 = COPY %def(s64)
+ %opaque:_(s64) = COPY $x0
+ %def:_(s64) = G_IMPLICIT_DEF
+ %me:_(s128) = G_MERGE_VALUES %opaque(s64), %def
+ $q0 = COPY %me(s128)
+ $x0 = COPY %def(s64)
+...
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 74e4a167ae14ca..afc1d932840ff7 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -45,25 +45,14 @@ define i64 @bswap_i16_to_i64_anyext(i16 %a) {
; The zext here is optimised to an any_extend during isel..
define i128 @bswap_i16_to_i128_anyext(i16 %a) {
-; CHECK-SD-LABEL: bswap_i16_to_i128_anyext:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: mov w8, w0
-; CHECK-SD-NEXT: mov x0, xzr
-; CHECK-SD-NEXT: rev w8, w8
-; CHECK-SD-NEXT: lsr w8, w8, #16
-; CHECK-SD-NEXT: lsl x1, x8, #48
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: bswap_i16_to_i128_anyext:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov w8, w0
-; CHECK-GI-NEXT: mov x0, xzr
-; CHECK-GI-NEXT: rev w8, w8
-; CHECK-GI-NEXT: lsr w8, w8, #16
-; CHECK-GI-NEXT: bfi x8, x8, #32, #32
-; CHECK-GI-NEXT: and x8, x8, #0xffff
-; CHECK-GI-NEXT: lsl x1, x8, #48
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: bswap_i16_to_i128_anyext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: lsl x1, x8, #48
+; CHECK-NEXT: ret
%3 = call i16 @llvm.bswap.i16(i16 %a)
%4 = zext i16 %3 to i128
%5 = shl i128 %4, 112
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 63f5464371cc62..e86282fa1883d9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1884,22 +1884,22 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) {
define amdgpu_ps i65 @s_ashr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_ashr_i65_33:
; GCN: ; %bb.0:
-; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GCN-NEXT: s_ashr_i32 s2, s3, 1
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
+; GCN-NEXT: s_lshr_b32 s2, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT: s_ashr_i32 s2, s5, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_ashr_i65_33:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 1
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = ashr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 5dd4fa0809131f..2ae9d28cda16a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1574,8 +1574,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX6-LABEL: v_lshr_i65:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 1, v2
; GFX6-NEXT: v_mov_b32_e32 v5, 0
+; GFX6-NEXT: v_and_b32_e32 v4, 1, v2
; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 64, v3
; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3
@@ -1596,8 +1596,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX8-LABEL: v_lshr_i65:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
; GFX8-NEXT: v_mov_b32_e32 v5, 0
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1]
@@ -1618,8 +1618,8 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
; GFX9-LABEL: v_lshr_i65:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v2
; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_and_b32_e32 v4, 1, v2
; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3
; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3
; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1]
@@ -1688,8 +1688,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v2
; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: v_and_b32_e32 v0, 1, v2
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1700,8 +1700,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v2
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1712,8 +1712,8 @@ define i65 @v_lshr_i65_33(i65 %value) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v2
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3
; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
@@ -1749,20 +1749,22 @@ define i65 @v_lshr_i65_33(i65 %value) {
define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
; GCN-LABEL: s_lshr_i65:
; GCN: ; %bb.0:
-; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1
-; GCN-NEXT: s_sub_i32 s10, s3, 64
-; GCN-NEXT: s_sub_i32 s8, 64, s3
-; GCN-NEXT: s_cmp_lt_u32 s3, 64
+; GCN-NEXT: s_mov_b32 s4, s3
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
+; GCN-NEXT: s_sub_i32 s10, s4, 64
+; GCN-NEXT: s_sub_i32 s8, 64, s4
+; GCN-NEXT: s_cmp_lt_u32 s4, 64
; GCN-NEXT: s_cselect_b32 s11, 1, 0
-; GCN-NEXT: s_cmp_eq_u32 s3, 0
+; GCN-NEXT: s_cmp_eq_u32 s4, 0
; GCN-NEXT: s_cselect_b32 s12, 1, 0
-; GCN-NEXT: s_lshr_b64 s[6:7], s[4:5], s3
-; GCN-NEXT: s_lshr_b64 s[2:3], s[0:1], s3
-; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s8
-; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s10
+; GCN-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
+; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
+; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GCN-NEXT: s_cmp_lg_u32 s12, 0
; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GCN-NEXT: s_cmp_lg_u32 s11, 0
@@ -1771,24 +1773,26 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
;
; GFX10PLUS-LABEL: s_lshr_i65:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1
-; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64
-; GFX10PLUS-NEXT: s_sub_i32 s2, 64, s3
-; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64
+; GFX10PLUS-NEXT: s_mov_b32 s4, s3
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_sub_i32 s10, s4, 64
+; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
+; GFX10PLUS-NEXT: s_sub_i32 s5, 64, s4
+; GFX10PLUS-NEXT: s_cmp_lt_u32 s4, 64
; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0
-; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u32 s4, 0
; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s3
-; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2
-; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[4:5], s3
+; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s4
+; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[2:3], s5
+; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[2:3], s4
; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[4:5], s10
+; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0
-; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s2, s4, 0
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, %amount
ret i65 %result
@@ -1797,22 +1801,22 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
; GCN-LABEL: s_lshr_i65_33:
; GCN: ; %bb.0:
-; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GCN-NEXT: s_lshr_b32 s0, s1, 1
-; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GCN-NEXT: s_lshr_b32 s2, s3, 1
+; GCN-NEXT: s_mov_b32 s3, 0
+; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1
+; GCN-NEXT: s_lshr_b32 s2, s1, 1
+; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT: s_lshr_b32 s2, s5, 1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i65_33:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
-; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GFX10PLUS-NEXT: s_lshr_b32 s2, s3, 1
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_mov_b32 s3, 0
+; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s1, 1
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[4:5], 31
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s5, 1
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index bac80f0777c024..ac6660b76ded98 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -1440,6 +1440,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX6-LABEL: v_sext_inreg_i65_22:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v3, 0
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 22
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1455,6 +1456,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX8-LABEL: v_sext_inreg_i65_22:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1470,6 +1472,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX9-LABEL: v_sext_inreg_i65_22:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX9-NEXT: v_or_b32_e32 v2, v2, v3
@@ -1484,6 +1487,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
; GFX10PLUS-LABEL: v_sext_inreg_i65_22:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_mov_b32_e32 v3, 0
; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3]
; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v3, 10, v1
; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 0, v[0:1]
@@ -1555,29 +1559,29 @@ define i65 @...
[truncated]
|
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF | ||
; CHECK-NEXT: $w0 = COPY [[DEF]](s32) | ||
; CHECK-NEXT: $w1 = COPY [[DEF1]](s32) | ||
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[DEF]](s32) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Future work: G_ZEXT of undef -> 0
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The test case is G_MERGE_VALUES of undef, undef, which can become undef. We decided it will become zero in the future.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do ZEXT instead of ANYEXT? Some special poison semantics?
My model of G_MERGE_VALUES is that the sources are memcopied into the destination. Sext would not fit into that model. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
x(s8) = merge(0b1011, undef)
==>
x = sext(0b1011)
==>
x = 0b11111011
Why would that be an incorrect transformation if the upper bits were undef? Unless I'm missing something funny with poison, G_ANYEXT is the least restrictive and most optimizable output, since you're not defining bits that were previously undefined.
ZEXT was/is more aggressive. |
For best code quality you want the opposite. That is, you want to put as few restrictions as possible on the generated code while maintaining correctness. |
All the wins are gone. Maybe, we are exploiting Zext and the known zero upper bits better. |
That suggests we're solving the wrong problem. It's usually better to find a particular example you're trying to optimize and work backwards, to see what things are missing. |
With G_ANYEXT, there are almost no changes. Perhaps G_MERGE_VALUES with two operands is not worth it to combine. |
LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); | ||
if (isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {DstTy}})) { | ||
Builder.buildConstant(MI.getOperand(0), C); | ||
MI.eraseFromParent(); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please take a step back to address the root causes of the issues.
We can't be just not doing what the function says it would do. This change needs its own rationale or PR if it is indeed the right change to do.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am interested in root cause analysis. We are combining G_ZEXT of undef into 0 in this PR and it relies on this function. We never build something before checking legality. The change itself is not negotiable. You can ask to move into a separate PR of course.
Pull out of llvm#113616 Legality checks for CombinerHelper::replaceInstWithConstant.
into zext x ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s32), [[DEF]](s32) Please continue padding merge values. // %bits_8_15:_(s8) = G_IMPLICIT_DEF // %0:_(s16) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8) %bits_8_15 is defined by undef. Its value is undefined and we can pick an arbitrary value. For optimization, we pick zero. // %0:_(s16) = G_ZEXT %bits_0_7:(s8) The upper bits of %0 are zero and the lower bits come from %bits_0_7.
e9a59fc
to
d9baace
Compare
Ping. |
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/27/builds/1932 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/24/builds/2637 Here is the relevant piece of the build log for the reference
|
into anyext x
; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s32), [[DEF]](s32)
Please continue padding merge values.
// %bits_8_15:_(s8) = G_IMPLICIT_DEF
// %0:_(s16) = G_MERGE_VALUES %bits_0_7:_(s8), %bits_8_15:_(s8)
%bits_8_15 is defined by undef. Its value is undefined and we can pick an arbitrary value. For optimization, we pick anyext, which plays well with the undefinedness.
// %0:_(s16) = G_ANYEXT %bits_0_7:_(s8)
The upper bits of %0 are undefined and the lower bits come from %bits_0_7.