[GISel] Fold shifts to constant result. #123510
Conversation
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-llvm-globalisel

Author: lialan (lialan)

Changes

This resolves #123212

Patch is 72.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123510.diff

13 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 94e36e412b0cf7..a51aa876e1deb0 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -840,8 +840,10 @@ class CombinerHelper {
bool matchRedundantBinOpInEquality(MachineInstr &MI,
BuildFnTy &MatchInfo) const;
- /// Match shifts greater or equal to the bitwidth of the operation.
- bool matchShiftsTooBig(MachineInstr &MI) const;
+ /// Match shifts greater or equal to the range (bitwidth of the operation, or
+ /// the source value).
+ bool matchShiftsTooBig(MachineInstr &MI,
+ std::optional<int64_t> &MatchInfo) const;
/// Match constant LHS ops that should be commuted.
bool matchCommuteConstantToRHS(MachineInstr &MI) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 8641eabbdd84c6..ae0856550d9356 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -306,11 +306,18 @@ def ptr_add_immed_chain : GICombineRule<
[{ return Helper.matchPtrAddImmedChain(*${d}, ${matchinfo}); }]),
(apply [{ Helper.applyPtrAddImmedChain(*${d}, ${matchinfo}); }])>;
+def shift_result_matchdata : GIDefMatchData<"std::optional<int64_t>">;
def shifts_too_big : GICombineRule<
- (defs root:$root),
+ (defs root:$root, shift_result_matchdata:$matchinfo),
(match (wip_match_opcode G_SHL, G_ASHR, G_LSHR):$root,
- [{ return Helper.matchShiftsTooBig(*${root}); }]),
- (apply [{ Helper.replaceInstWithUndef(*${root}); }])>;
+ [{ return Helper.matchShiftsTooBig(*${root}, ${matchinfo}); }]),
+ (apply [{
+ if (${matchinfo}) {
+ Helper.replaceInstWithConstant(*${root}, *${matchinfo});
+ } else {
+ Helper.replaceInstWithUndef(*${root});
+ }
+ }])>;
// Fold shift (shift base x), y -> shift base, (x+y), if shifts are same
def shift_immed_matchdata : GIDefMatchData<"RegisterImmPair">;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 4e3aaf5da7198c..4821c21301946b 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6590,12 +6590,55 @@ bool CombinerHelper::matchRedundantBinOpInEquality(MachineInstr &MI,
return CmpInst::isEquality(Pred) && Y.isValid();
}
-bool CombinerHelper::matchShiftsTooBig(MachineInstr &MI) const {
+static std::optional<unsigned>
+getMaxUsefulShift(KnownBits ValueKB, unsigned Opcode,
+ std::optional<int64_t> &Result) {
+ assert(Opcode == TargetOpcode::G_SHL || Opcode == TargetOpcode::G_LSHR ||
+ Opcode == TargetOpcode::G_ASHR && "Expect G_SHL, G_LSHR or G_ASHR.");
+ auto SignificantBits = 0;
+ switch (Opcode) {
+ case TargetOpcode::G_SHL:
+ SignificantBits = ValueKB.countMinTrailingZeros();
+ Result = 0;
+ break;
+ case TargetOpcode::G_LSHR:
+ Result = 0;
+ SignificantBits = ValueKB.countMinLeadingZeros();
+ break;
+ case TargetOpcode::G_ASHR:
+ if (ValueKB.isNonNegative()) {
+ SignificantBits = ValueKB.countMinLeadingZeros();
+ Result = 0;
+ } else if (ValueKB.isNegative()) {
+ SignificantBits = ValueKB.countMinLeadingOnes();
+ Result = -1;
+ } else {
+ // Cannot determine shift result.
+ Result = std::nullopt;
+ }
+ break;
+ default:
+ break;
+ }
+ return ValueKB.getBitWidth() - SignificantBits;
+}
+
+bool CombinerHelper::matchShiftsTooBig(
+ MachineInstr &MI, std::optional<int64_t> &MatchInfo) const {
+ Register ShiftVal = MI.getOperand(1).getReg();
Register ShiftReg = MI.getOperand(2).getReg();
LLT ResTy = MRI.getType(MI.getOperand(0).getReg());
auto IsShiftTooBig = [&](const Constant *C) {
auto *CI = dyn_cast<ConstantInt>(C);
- return CI && CI->uge(ResTy.getScalarSizeInBits());
+ if (!CI)
+ return false;
+ if (CI->uge(ResTy.getScalarSizeInBits())) {
+ MatchInfo = std::nullopt;
+ return true;
+ }
+ auto OptMaxUsefulShift = getMaxUsefulShift(KB->getKnownBits(ShiftVal),
+ MI.getOpcode(), MatchInfo);
+ return OptMaxUsefulShift && CI->uge(*OptMaxUsefulShift);
};
return matchUnaryPredicate(MRI, ShiftReg, IsShiftTooBig);
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
new file mode 100644
index 00000000000000..d3a3827f35a18a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
@@ -0,0 +1,115 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: combine_ashr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr31
+
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: combine_ashr
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr31, $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[MV]](p0) :: (store (s32))
+ ; CHECK-NEXT: SI_RETURN
+ %9:_(s32) = COPY $vgpr0
+ %10:_(s32) = COPY $vgpr1
+ %0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
+ %12:_(s32) = G_CONSTANT i32 10
+ %11:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+ %13:_(s32) = G_ASHR %11, %12(s32)
+ G_STORE %13(s32), %0(p0) :: (store (s32))
+ SI_RETURN
+
+...
+---
+name: combine_lshr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr31
+
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: combine_lshr
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr31, $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[MV]](p0) :: (store (s32))
+ ; CHECK-NEXT: SI_RETURN
+ %9:_(s32) = COPY $vgpr0
+ %10:_(s32) = COPY $vgpr1
+ %0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
+ %12:_(s32) = G_CONSTANT i32 10
+ %11:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+ %13:_(s32) = G_LSHR %11, %12(s32)
+ G_STORE %13(s32), %0(p0) :: (store (s32))
+ SI_RETURN
+
+...
+---
+name: combine_shl
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr31
+
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: combine_shl
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr31, $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[MV]](p0) :: (store (s32))
+ ; CHECK-NEXT: SI_RETURN
+ %9:_(s32) = COPY $vgpr0
+ %10:_(s32) = COPY $vgpr1
+ %0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
+ %12:_(s32) = G_CONSTANT i32 16
+ %11:_(s32) = G_CONSTANT i32 4294901760
+ %13:_(s32) = G_SHL %11, %12(s32)
+ G_STORE %13(s32), %0(p0) :: (store (s32))
+ SI_RETURN
+
+...
+---
+name: combine_ashr2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr31
+
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: combine_ashr2
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr31, $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 -1
+ ; CHECK-NEXT: G_STORE [[C]](s8), [[MV]](p0) :: (store (s8))
+ ; CHECK-NEXT: SI_RETURN
+ %9:_(s32) = COPY $vgpr0
+ %10:_(s32) = COPY $vgpr1
+ %0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
+ %12:_(s32) = G_CONSTANT i32 1
+ %11:_(s8) = G_CONSTANT i8 -2
+ %13:_(s8) = G_ASHR %11, %12(s32)
+ G_STORE %13(s8), %0(p0) :: (store (s8))
+ SI_RETURN
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
index 6ae8895322d6f9..a8cd974b01ab4b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
@@ -374,23 +374,15 @@ body: |
; GFX6-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
; GFX6: liveins: $vgpr0
; GFX6-NEXT: {{ $}}
- ; GFX6-NEXT: %zero:_(s16) = G_CONSTANT i16 0
- ; GFX6-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
- ; GFX6-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
- ; GFX6-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
- ; GFX6-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
- ; GFX6-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
+ ; GFX6-NEXT: %6:_(s32) = G_CONSTANT i32 0
+ ; GFX6-NEXT: %shl:_(<2 x s32>) = G_BUILD_VECTOR %6(s32), %6(s32)
; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
;
; GFX9-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: %zero:_(s16) = G_CONSTANT i16 0
- ; GFX9-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
- ; GFX9-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
- ; GFX9-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
- ; GFX9-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
- ; GFX9-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
+ ; GFX9-NEXT: %6:_(s32) = G_CONSTANT i32 0
+ ; GFX9-NEXT: %shl:_(<2 x s32>) = G_BUILD_VECTOR %6(s32), %6(s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
%zero:_(s16) = G_CONSTANT i16 0
%zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero, %zero:_(s16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
index 6ceb41199af6da..3780542cd87993 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
@@ -246,23 +246,15 @@ body: |
; GFX6-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
; GFX6: liveins: $vgpr0, $vgpr1
; GFX6-NEXT: {{ $}}
- ; GFX6-NEXT: %zero:_(s16) = G_CONSTANT i16 0
- ; GFX6-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
- ; GFX6-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
- ; GFX6-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
- ; GFX6-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
- ; GFX6-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
+ ; GFX6-NEXT: %6:_(s32) = G_CONSTANT i32 0
+ ; GFX6-NEXT: %shl:_(<2 x s32>) = G_BUILD_VECTOR %6(s32), %6(s32)
; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
;
; GFX9-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: %zero:_(s16) = G_CONSTANT i16 0
- ; GFX9-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
- ; GFX9-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
- ; GFX9-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
- ; GFX9-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
- ; GFX9-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
+ ; GFX9-NEXT: %6:_(s32) = G_CONSTANT i32 0
+ ; GFX9-NEXT: %shl:_(<2 x s32>) = G_BUILD_VECTOR %6(s32), %6(s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
%zero:_(s16) = G_CONSTANT i16 0
%zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero, %zero:_(s16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index b9cd330ee2b5f9..4ddbb0afd7fc58 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -1434,13 +1434,11 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
; SI-LABEL: v_test_sitofp_i64_byte_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_ffbh_i32_e32 v2, 0
+; SI-NEXT: v_add_i32_e32 v2, vcc, -1, v2
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
-; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; SI-NEXT: v_ffbh_i32_e32 v3, 0
-; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2
-; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v3
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: v_min_u32_e32 v2, v3, v2
+; SI-NEXT: v_min_u32_e32 v2, 32, v2
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
; SI-NEXT: v_min_u32_e32 v0, 1, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
@@ -1452,13 +1450,11 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
; VI-LABEL: v_test_sitofp_i64_byte_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_ffbh_i32_e32 v2, 0
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v2
; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
-; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; VI-NEXT: v_ffbh_i32_e32 v3, 0
-; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v3
; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: v_min_u32_e32 v2, v3, v2
+; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; VI-NEXT: v_min_u32_e32 v0, 1, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index cc185aff9eff22..784611cf68dd23 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1800,9 +1800,9 @@ define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
; GCN-NEXT: s_lshr_b32 s0, s1, 1
; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GCN-NEXT: s_lshr_b32 s2, s3, 1
+; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
+; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i65_33:
@@ -1810,9 +1810,9 @@ define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GFX10PLUS-NEXT: s_lshr_b32 s2, s3, 1
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10PLUS-NEXT: s_mov_b32 s2, 0
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
index 88eb0e4b848c95..2fa5492c8a2b72 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
@@ -608,34 +608,25 @@ define i32 @v_sdiv_i32_24bit(i32 %num, i32 %den) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
-; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v1
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
-; GISEL-NEXT: v_mul_lo_u32 v5, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1
+; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GISEL-NEXT: v_mul_lo_u32 v3, v3, v2
+; GISEL-NEXT: v_mul_hi_u32 v3, v2, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; GISEL-NEXT: v_mul_hi_u32 v2, v0, v2
+; GISEL-NEXT: v_mul_lo_u32 v3, v2, v1
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v2
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v2
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v1, v2, v3
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_i32_24bit:
@@ -677,20 +668,6 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3
-; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v2
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT: v_xor_b32_e32 v8, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT: v_xor_b32_e32 v9, v6, v7
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
@@ -711,15 +688,15 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
; GISEL-NEXT: v_mul_lo_u32 v6, v4, v2
; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; GISEL-NEXT: v_mul_lo_u32 v10, v5, v3
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v5
+; GISEL-NEXT: v_mul_lo_u32 v8, v5, v3
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v5
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v10
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
; GISEL-NEXT: v_sub_i32_e64 v7, s[6...
[truncated]
Should not call these "out-of-range" shifts. They are just shifts that can be folded to a constant based on the known bits of the operands.
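A minimal standalone sketch of that reasoning (a hypothetical helper, not code from this patch): it assumes, as the new MIR tests do, a 32-bit value known to fit in 10 bits, such as the AMDGPU workitem id. Once KnownBits proves 22 leading zero bits, a logical shift right by 10 or more can only produce 0, so the combine can replace the shift with that constant.

```cpp
// Standalone illustration (not LLVM code): with N known leading zero bits in a
// 32-bit value, any logical shift right by 32 - N or more can only yield 0.
#include <cassert>
#include <cstdint>

static bool lshrFoldsToZero(unsigned KnownLeadingZeros, unsigned ShiftAmt) {
  unsigned SignificantBits = 32u - KnownLeadingZeros;
  return ShiftAmt >= SignificantBits;
}

int main() {
  // A value in [0, 1023] (e.g. a workitem id) has 22 known leading zeros,
  // so shifting it right by 10 or more always produces 0 ...
  assert(lshrFoldsToZero(/*KnownLeadingZeros=*/22, /*ShiftAmt=*/10));
  // ... while a shift by 9 may still leave a set bit: 1023 >> 9 == 1.
  assert(!lshrFoldsToZero(22, 9));
  assert((UINT32_C(1023) >> 10) == 0 && (UINT32_C(1023) >> 9) == 1);
  return 0;
}
```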
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: G_STORE [[C]](s32), [[MV]](p0) :: (store (s32))
; CHECK-NEXT: SI_RETURN
%9:_(s32) = COPY $vgpr0 |
Compact register numbers
@@ -0,0 +1,115 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s |
Don't need -verify-machineinstrs
G_STORE %13(s8), %0(p0) :: (store (s8))
SI_RETURN

...
Test vector cases
test cases added
  case TargetOpcode::G_ASHR:
    if (ValueKB.isNonNegative()) {
      SignificantBits = ValueKB.countMinLeadingZeros();
      Result = 0;
    } else if (ValueKB.isNegative()) {
      SignificantBits = ValueKB.countMinLeadingOnes();
      Result = -1;
    } else {
      // Cannot determine shift result.
      Result = std::nullopt;
    }
    break;
This should be using computeKnownSignBits instead of computeKnownBits; it can be slightly smarter.
LGTM, can switch to computeNumSignBits separately
This resolves #123212