AMDGPU: Materialize bitwise not of inline immediates #95960
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

If we have a bitwise negated inline immediate, we can materialize it with s_not_b32/v_not_b32. This mirrors the current bitreverse handling. As a side effect, we also now handle the bitreversed FP immediate case.

One test shows some VOPD regressions on gfx11 which should probably be fixed. Previously the 2 v_mov_b32 could be packed, but now the mismatched opcode + mov can't. This problem already existed for the bfrev case, it just happens more often now.

Patch is 42.75 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95960.diff

16 Files Affected:
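(For illustration, a minimal before/after sketch of the rewrite, using values that appear in the test updates in the diff below rather than new output.)

; before: 0xffffffc0 is not an inline immediate, so the mov needs a 32-bit literal
v_mov_b32_e32 v1, 0xffffffc0
; after: ~0xffffffc0 == 63, which is in the inline range [-16, 64], so no literal is needed
v_not_b32_e32 v1, 63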
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 647fae904d393..b064a371b759e 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -183,15 +183,31 @@ bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
return false;
}
-/// \returns true if the constant in \p Src should be replaced with a bitreverse
-/// of an inline immediate.
-bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
- int32_t &ReverseImm) const {
- if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
- return false;
-
- ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
- return ReverseImm >= -16 && ReverseImm <= 64;
+/// \returns the opcode of an instruction a move immediate of the constant \p
+/// Src can be replaced with if the constant is replaced with \p ModifiedImm.
+/// i.e.
+///
+/// If the bitreverse of a constant is an inline immediate, reverse the
+/// immediate and return the bitreverse opcode.
+///
+/// If the bitwise negation of a constant is an inline immediate, reverse the
+/// immediate and return the bitwise not opcode.
+static unsigned canModifyToInlineImmOp32(const SIInstrInfo *TII,
+ const MachineOperand &Src,
+ int32_t &ModifiedImm, bool Scalar) {
+ if (TII->isInlineConstant(Src))
+ return 0;
+ int32_t SrcImm = static_cast<int32_t>(Src.getImm());
+
+ ModifiedImm = ~SrcImm;
+ if (TII->isInlineConstant(APInt(32, ModifiedImm)))
+ return Scalar ? AMDGPU::S_NOT_B32 : AMDGPU::V_NOT_B32_e32;
+
+ ModifiedImm = reverseBits<int32_t>(SrcImm);
+ if (TII->isInlineConstant(APInt(32, ModifiedImm)))
+ return Scalar ? AMDGPU::S_BREV_B32 : AMDGPU::V_BFREV_B32_e32;
+
+ return 0;
}
/// Copy implicit register operands from specified instruction to this
@@ -801,10 +817,12 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// XXX - not exactly a check for post-regalloc run.
MachineOperand &Src = MI.getOperand(1);
if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
- int32_t ReverseImm;
- if (isReverseInlineImm(Src, ReverseImm)) {
- MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
- Src.setImm(ReverseImm);
+ int32_t ModImm;
+ unsigned ModOpcode =
+ canModifyToInlineImmOp32(TII, Src, ModImm, /*Scalar=*/false);
+ if (ModOpcode != 0) {
+ MI.setDesc(TII->get(ModOpcode));
+ Src.setImm(static_cast<int64_t>(ModImm));
continue;
}
}
@@ -863,17 +881,21 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MachineOperand &Src = MI.getOperand(1);
if (Src.isImm() && Dst.getReg().isPhysical()) {
- int32_t ReverseImm;
+ unsigned ModOpc;
+ int32_t ModImm;
if (isKImmOperand(Src)) {
MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
Src.setImm(SignExtend64(Src.getImm(), 32));
- } else if (isReverseInlineImm(Src, ReverseImm)) {
- MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
- Src.setImm(ReverseImm);
+ continue;
}
- }
- continue;
+ if ((ModOpc = canModifyToInlineImmOp32(TII, Src, ModImm,
+ /*Scalar=*/true))) {
+ MI.setDesc(TII->get(ModOpc));
+ Src.setImm(static_cast<int64_t>(ModImm));
+ continue;
+ }
+ }
}
// Shrink scalar logic operations.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
index 496ee9f2dbb2b..c8b82716a9fe1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -178,7 +178,7 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffc0
+; GFX8-NEXT: v_not_b32_e32 v1, 63
; GFX8-NEXT: v_add_u16_e32 v2, 0xffc0, v0
; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
@@ -244,7 +244,7 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
; GFX8-LABEL: v_add_v2i16_neg_inline_imm_hi:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffc0
+; GFX8-NEXT: v_not_b32_e32 v1, 63
; GFX8-NEXT: v_add_u16_e32 v2, 4, v0
; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 4df5fa18e2942..afffebea451a0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -1486,7 +1486,7 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
; GFX6: ; %bb.0:
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8
+; GFX6-NEXT: v_not_b32_e32 v1, 23
; GFX6-NEXT: s_and_b32 s2, s2, 0xffffff
; GFX6-NEXT: s_bfe_u32 s1, s1, 0x170001
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -1516,7 +1516,7 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
; GFX8: ; %bb.0:
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8
+; GFX8-NEXT: v_not_b32_e32 v1, 23
; GFX8-NEXT: s_and_b32 s2, s2, 0xffffff
; GFX8-NEXT: s_bfe_u32 s1, s1, 0x170001
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -1546,7 +1546,7 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
; GFX9: ; %bb.0:
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8
+; GFX9-NEXT: v_not_b32_e32 v1, 23
; GFX9-NEXT: s_and_b32 s2, s2, 0xffffff
; GFX9-NEXT: s_bfe_u32 s1, s1, 0x170001
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -1646,7 +1646,7 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, 24
; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffffe8
+; GFX6-NEXT: v_not_b32_e32 v4, 23
; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 23
; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
@@ -1676,7 +1676,7 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v3, 24
; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffffe8
+; GFX8-NEXT: v_not_b32_e32 v4, 23
; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GFX8-NEXT: v_bfe_u32 v1, v1, 1, 23
; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
@@ -1706,7 +1706,7 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffffe8
+; GFX9-NEXT: v_not_b32_e32 v4, 23
; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GFX9-NEXT: v_bfe_u32 v1, v1, 1, 23
; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
@@ -1822,7 +1822,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX6-NEXT: s_lshl_b32 s6, s6, 16
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
-; GFX6-NEXT: v_mov_b32_e32 v3, 0xffffffe8
+; GFX6-NEXT: v_not_b32_e32 v3, 23
; GFX6-NEXT: s_or_b32 s6, s8, s6
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: s_lshr_b32 s0, s2, 16
@@ -1959,7 +1959,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX8-NEXT: s_or_b32 s2, s2, s6
; GFX8-NEXT: s_lshl_b32 s3, s3, 8
; GFX8-NEXT: s_and_b32 s6, s9, 0xff
-; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8
+; GFX8-NEXT: v_not_b32_e32 v1, 23
; GFX8-NEXT: s_or_b32 s3, s8, s3
; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1
@@ -2079,7 +2079,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX9-NEXT: s_or_b32 s2, s2, s6
; GFX9-NEXT: s_lshl_b32 s3, s3, 8
; GFX9-NEXT: s_and_b32 s6, s9, 0xff
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8
+; GFX9-NEXT: v_not_b32_e32 v1, 23
; GFX9-NEXT: s_or_b32 s3, s8, s3
; GFX9-NEXT: s_and_b32 s6, 0xffff, s6
; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1
@@ -2414,7 +2414,7 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GFX6-NEXT: v_mov_b32_e32 v7, 0xffffffe8
+; GFX6-NEXT: v_not_b32_e32 v7, 23
; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; GFX6-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
@@ -2461,7 +2461,7 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GFX8-NEXT: v_mov_b32_e32 v7, 0xffffffe8
+; GFX8-NEXT: v_not_b32_e32 v7, 23
; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX8-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
@@ -2508,7 +2508,7 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GFX9-NEXT: v_mov_b32_e32 v7, 0xffffffe8
+; GFX9-NEXT: v_not_b32_e32 v7, 23
; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 61588e640be18..8538dcabca924 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -1487,7 +1487,7 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
; GFX6: ; %bb.0:
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8
+; GFX6-NEXT: v_not_b32_e32 v1, 23
; GFX6-NEXT: s_and_b32 s2, s2, 0xffffff
; GFX6-NEXT: s_lshl_b32 s0, s0, 1
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -1518,7 +1518,7 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
; GFX8: ; %bb.0:
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8
+; GFX8-NEXT: v_not_b32_e32 v1, 23
; GFX8-NEXT: s_and_b32 s2, s2, 0xffffff
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -1549,7 +1549,7 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
; GFX9: ; %bb.0:
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8
+; GFX9-NEXT: v_not_b32_e32 v1, 23
; GFX9-NEXT: s_and_b32 s2, s2, 0xffffff
; GFX9-NEXT: s_and_b32 s1, s1, 0xffffff
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -1652,7 +1652,7 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, 24
; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffffe8
+; GFX6-NEXT: v_not_b32_e32 v4, 23
; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
@@ -1683,7 +1683,7 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v3, 24
; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffffe8
+; GFX8-NEXT: v_not_b32_e32 v4, 23
; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
@@ -1714,7 +1714,7 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffffe8
+; GFX9-NEXT: v_not_b32_e32 v4, 23
; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
@@ -1820,7 +1820,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mov_b32_e32 v3, 0xffffffe8
+; GFX6-NEXT: v_not_b32_e32 v3, 23
; GFX6-NEXT: s_lshr_b32 s6, s0, 16
; GFX6-NEXT: s_and_b32 s8, s0, 0xff
; GFX6-NEXT: s_lshl_b32 s9, s9, 8
@@ -1962,7 +1962,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX8-NEXT: s_or_b32 s2, s2, s8
; GFX8-NEXT: s_lshl_b32 s3, s3, 8
; GFX8-NEXT: s_and_b32 s8, s11, 0xff
-; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8
+; GFX8-NEXT: v_not_b32_e32 v1, 23
; GFX8-NEXT: s_or_b32 s3, s10, s3
; GFX8-NEXT: s_and_b32 s8, 0xffff, s8
; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1
@@ -2082,7 +2082,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX9-NEXT: s_or_b32 s2, s2, s8
; GFX9-NEXT: s_lshl_b32 s3, s3, 8
; GFX9-NEXT: s_and_b32 s8, s11, 0xff
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8
+; GFX9-NEXT: v_not_b32_e32 v1, 23
; GFX9-NEXT: s_or_b32 s3, s10, s3
; GFX9-NEXT: s_and_b32 s8, 0xffff, s8
; GFX9-NEXT: v_mul_lo_u32 v1, v0, v1
@@ -2424,7 +2424,7 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GFX6-NEXT: v_mov_b32_e32 v7, 0xffffffe8
+; GFX6-NEXT: v_not_b32_e32 v7, 23
; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; GFX6-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
@@ -2473,7 +2473,7 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GFX8-NEXT: v_mov_b32_e32 v7, 0xffffffe8
+; GFX8-NEXT: v_not_b32_e32 v7, 23
; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX8-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
@@ -2522,7 +2522,7 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GFX9-NEXT: v_mov_b32_e32 v7, 0xffffffe8
+; GFX9-NEXT: v_not_b32_e32 v7, 23
; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4
; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v5
; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 6e96a4ddbc0b3..546376c5962be 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -865,7 +865,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX1030-NEXT: flat_load_dword v2, v[0:1]
; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7
-; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
+; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
@@ -894,7 +894,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX1013-NEXT: flat_load_dword v2, v[0:1]
; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7
-; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
+; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
; GFX1013-NEXT: s_waitcnt vmcnt(0)
@@ -973,7 +973,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX1030-NEXT: flat_load_dword v2, v[0:1]
; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6
-; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
+; GFX1030-NEXT: v_bfrev_b32_e32 v1, 4.0
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
; GFX1030-NEXT: s_waitcnt vmcnt(0)
@@ -999,7 +999,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX1013-NEXT: flat_load_dword v2, v[0:1]
; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6
-; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
+; GFX1013-NEXT: v_bfrev_b32_e32 v1, 4.0
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
; GFX1013-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index 7ad19a4797003..0f8ebbc794f29 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -423,7 +423,7 @@ bb:
define amdgpu_ps <2 x i64> @s_shl_v2i64_zext_v2i32(<2 x i32> inreg %x) {
; GCN-LABEL: s_shl_v2i64_zext_v2i32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_brev_b32 s2, -4
+; GCN-NEXT: s_not_b32 s2, -2.0
; GCN-NEXT: s_mov_b32 s3, s2
; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-NEXT: s_lshl_b32 s0, s0, 2
@@ -434,7 +434,7 @@ define amdgpu_ps <2 x i64> @s_shl_v2i64_zext_v2i32(<2 x i32> inreg %x) {
;
; GFX10PLUS-LABEL: s_shl_v2i64_zext_v2i32:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_brev_b32 s2, -4
+; GFX10PLUS-NEXT: s_not_b32 s2, -2.0
; GFX10PLUS-NEXT: s_mov_b32 s3, s2
; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
; GFX10PLUS-NEXT: s_mov_b32 s3, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
index 5613501736ff0..855687281ce9a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
@@ -147,7 +147,7 @@ define <2 x i16> @v_sub_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_splat:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffc0
+; GFX8-NEXT: v_not_b32_e32 v1, 63
; GFX8-NEXT: v_subrev_u16_e32 v2, 0xffc0, v0
; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
@@ -211,7 +211,7 @@ define <2 x i16> @v_sub_v2i16_neg_inline_imm_hi(<2 x i16...
[truncated]
Very nice!
The first 2 changes I tried to avoid the VOPD regression didn't work. Checking if it's a post-RA run and moving the VOPD pass both still see the same regression.
Force-pushed from 75a9959 to 5566107.
The VOPD lit regression was from now handling the reversed FP immediate case. We were always interfering with VOPD here.
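(Concretely, taking the values from the intersect_ray test in the diff: 0x102 is the bit-reversal of 0x40800000, the encoding of 4.0, which is an inline FP immediate, so these movs are now rewritten as well.)

; before: 0x102 is not an inline immediate, so the mov needs a 32-bit literal
v_mov_b32_e32 v1, 0x102
; after: reverseBits(0x102) == 0x40800000 == 4.0, an inline FP immediate
v_bfrev_b32_e32 v1, 4.0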
LGTM
If we have a bitwise negated inline immediate, we can materialize
it with s_not_b32/v_not_b32. This mirrors the current bitreverse
handling.
As a side effect, we also now handle the bitreversed FP immediate
case.
One test shows some VOPD regressions on gfx11 which should
probably be fixed. Previously the 2 v_mov_b32 could be packed,
but now the mismatched opcode + mov can't. This problem already
existed for the bfrev case, it just happens more often now.