[AMDGPU] Canonicalize G_ZEXT of the shift amount in RegBankCombiner #131792
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel

Author: Pierre van Houtryve (Pierre-vh)

Changes

Canonicalize it to a G_AND instead so that ISel patterns can pick it up and ignore it, as the shift instructions only read low bits. G_ZEXT would be lowered to a v/s_and anyway in most cases.

I'm also looking at making a DAG version of this in a separate patch.

Full diff: https://github.com/llvm/llvm-project/pull/131792.diff

3 Files Affected:

- llvm/lib/Target/AMDGPU/AMDGPUCombine.td
- llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir
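To make the reasoning concrete, here is a small standalone C++ sketch (not part of the patch; `hwLshr` and the constants are made up for illustration). It shows that the zero-extended amount and the any-extended-then-masked amount agree exactly, and that a hardware shift which only reads the low bits of its amount is unaffected even if the mask is dropped, which is why ISel can ignore the G_AND.

```cpp
// Standalone sketch, not part of the patch. Models a 32-bit hardware shift
// that only reads the low 5 bits of its amount operand.
#include <cassert>
#include <cstdint>

static uint32_t hwLshr(uint32_t X, uint32_t Amt) { return X >> (Amt & 31); }

int main() {
  uint16_t Amt16 = 0x1203;   // narrow (s16) shift-amount source
  uint32_t X = 0xDEADBEEFu;

  // G_ZEXT form of the amount.
  uint32_t ZextAmt = Amt16;
  // G_ANYEXT leaves the high 16 bits undefined (modelled as junk here);
  // the G_AND with 0xFFFF clears them, giving the same value as the zext.
  uint32_t AnyExtAmt = uint32_t(Amt16) | 0xABCD0000u;
  uint32_t AndAmt = AnyExtAmt & 0xFFFFu;
  assert(AndAmt == ZextAmt);

  // The shift only reads the low 5 bits, so even the unmasked any-extend
  // gives the same result -- this is why ISel can drop the G_AND entirely.
  assert(hwLshr(X, AnyExtAmt) == hwLshr(X, ZextAmt));
  return 0;
}
```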
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index a21505356274b..a0eb0ffea8d7e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -134,6 +134,22 @@ def combine_fmul_with_select_to_fldexp : GICombineRule<
[{ return Helper.matchCombineFmulWithSelectToFldexp(*${root}, *${sel}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+// (shift x, (zext amt)) -> (shift x, (and (anyext amt), mask))
+//
+// The pattern is longer, but is better for matching during ISel.
+class canonicalize_zext_shift_amt<Instruction opc> : GICombineRule<
+ (defs root:$dst),
+ (match (G_ZEXT $amt, $amtsrc):$zext,
+ (opc $dst, $src, $amt):$shift),
+ (apply [{ applyCanonicalizeZextShiftAmt(*${shift}, *${zext}); }])>;
+
+def canonicalize_zext_lshr : canonicalize_zext_shift_amt<G_LSHR>;
+def canonicalize_zext_ashr : canonicalize_zext_shift_amt<G_ASHR>;
+def canonicalize_zext_shl : canonicalize_zext_shift_amt<G_SHL>;
+
+def zext_of_shift_amount_combines : GICombineGroup<[
+ canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl
+]>;
let Predicates = [Has16BitInsts, NotHasMed3_16] in {
// For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
@@ -181,5 +197,5 @@ def AMDGPURegBankCombiner : GICombiner<
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
identity_combines, redundant_and, constant_fold_cast_op,
- cast_of_cast_combines]> {
+ cast_of_cast_combines, zext_of_shift_amount_combines]> {
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 98c48f4fe3705..68bffe5bbb7f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -87,6 +87,8 @@ class AMDGPURegBankCombinerImpl : public Combiner {
void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
void applyClamp(MachineInstr &MI, Register &Reg) const;
+ void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const;
+
private:
SIModeRegisterDefaults getMode() const;
bool getIEEE() const;
@@ -362,6 +364,34 @@ void AMDGPURegBankCombinerImpl::applyMed3(MachineInstr &MI,
MI.eraseFromParent();
}
+void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
+ MachineInstr &MI, MachineInstr &Ext) const {
+ unsigned ShOpc = MI.getOpcode();
+ assert(ShOpc == AMDGPU::G_SHL || ShOpc == AMDGPU::G_LSHR ||
+ ShOpc == AMDGPU::G_ASHR);
+ assert(Ext.getOpcode() == AMDGPU::G_ZEXT);
+
+ Register AmtReg = Ext.getOperand(1).getReg();
+ Register ShDst = MI.getOperand(0).getReg();
+ Register ShSrc = MI.getOperand(1).getReg();
+
+ LLT ExtAmtTy = MRI.getType(Ext.getOperand(0).getReg());
+ LLT AmtTy = MRI.getType(AmtReg);
+
+ auto &RB = *MRI.getRegBank(AmtReg);
+
+ auto NewExt = B.buildAnyExt(ExtAmtTy, AmtReg);
+ auto Mask = B.buildConstant(
+ ExtAmtTy, maskTrailingOnes<uint64_t>(AmtTy.getScalarSizeInBits()));
+ auto And = B.buildAnd(ExtAmtTy, NewExt, Mask);
+ B.buildInstr(ShOpc, {ShDst}, {ShSrc, And});
+
+ MRI.setRegBank(NewExt.getReg(0), RB);
+ MRI.setRegBank(Mask.getReg(0), RB);
+ MRI.setRegBank(And.getReg(0), RB);
+ MI.eraseFromParent();
+}
+
SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
return MF.getInfo<SIMachineFunctionInfo>()->getMode();
}
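As a side note, here is a minimal sketch (not part of the patch; the `main` wrapper is only for demonstration) of how the AND mask is derived from the pre-extension amount type. `maskTrailingOnes` is the helper from `llvm/Support/MathExtras.h` that the patch calls, and the resulting constants match the 65535 and 255 that appear in the MIR tests below.

```cpp
// Illustrative only: the mask applyCanonicalizeZextShiftAmt builds when
// AmtTy.getScalarSizeInBits() is 16 or 8.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  assert(llvm::maskTrailingOnes<uint64_t>(16) == 65535u); // s16 amount -> 0xFFFF
  assert(llvm::maskTrailingOnes<uint64_t>(8) == 255u);    // s8 amount  -> 0xFF
  return 0;
}
```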
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir
new file mode 100644
index 0000000000000..77d30f6fa5223
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir
@@ -0,0 +1,146 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-regbank-combiner %s -o - | FileCheck %s
+
+---
+name: lshr_zext_i16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: lshr_zext_i16
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+ ; CHECK-NEXT: %res:sgpr(s32) = G_LSHR %src, [[AND]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+ %src:sgpr(s32) = COPY $sgpr0
+ %regamt:sgpr(s32) = COPY $sgpr1
+ %amt:sgpr(s16) = G_TRUNC %regamt
+ %zextamt:sgpr(s32) = G_ZEXT %amt
+ %res:sgpr(s32) = G_LSHR %src, %zextamt
+ $sgpr0 = COPY %res
+...
+
+---
+name: ashr_zext_i16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: ashr_zext_i16
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+ ; CHECK-NEXT: %res:sgpr(s32) = G_ASHR %src, [[AND]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+ %src:sgpr(s32) = COPY $sgpr0
+ %regamt:sgpr(s32) = COPY $sgpr1
+ %amt:sgpr(s16) = G_TRUNC %regamt
+ %zextamt:sgpr(s32) = G_ZEXT %amt
+ %res:sgpr(s32) = G_ASHR %src, %zextamt
+ $sgpr0 = COPY %res
+...
+
+---
+name: shl_zext_i16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: shl_zext_i16
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+ ; CHECK-NEXT: %res:sgpr(s32) = G_SHL %src, [[AND]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+ %src:sgpr(s32) = COPY $sgpr0
+ %regamt:sgpr(s32) = COPY $sgpr1
+ %amt:sgpr(s16) = G_TRUNC %regamt
+ %zextamt:sgpr(s32) = G_ZEXT %amt
+ %res:sgpr(s32) = G_SHL %src, %zextamt
+ $sgpr0 = COPY %res
+...
+
+---
+name: lshr_zext_i8
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: lshr_zext_i8
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+ ; CHECK-NEXT: %res:sgpr(s32) = G_LSHR %src, [[AND]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+ %src:sgpr(s32) = COPY $sgpr0
+ %regamt:sgpr(s32) = COPY $sgpr1
+ %amt:sgpr(s8) = G_TRUNC %regamt
+ %zextamt:sgpr(s32) = G_ZEXT %amt
+ %res:sgpr(s32) = G_LSHR %src, %zextamt
+ $sgpr0 = COPY %res
+...
+
+---
+name: ashr_zext_i8
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: ashr_zext_i8
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+ ; CHECK-NEXT: %res:sgpr(s32) = G_ASHR %src, [[AND]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+ %src:sgpr(s32) = COPY $sgpr0
+ %regamt:sgpr(s32) = COPY $sgpr1
+ %amt:sgpr(s8) = G_TRUNC %regamt
+ %zextamt:sgpr(s32) = G_ZEXT %amt
+ %res:sgpr(s32) = G_ASHR %src, %zextamt
+ $sgpr0 = COPY %res
+...
+
+---
+name: shl_zext_i8
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: shl_zext_i8
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+ ; CHECK-NEXT: %res:sgpr(s32) = G_SHL %src, [[AND]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+ %src:sgpr(s32) = COPY $sgpr0
+ %regamt:sgpr(s32) = COPY $sgpr1
+ %amt:sgpr(s8) = G_TRUNC %regamt
+ %zextamt:sgpr(s32) = G_ZEXT %amt
+ %res:sgpr(s32) = G_SHL %src, %zextamt
+ $sgpr0 = COPY %res
+...
Review thread on the new comment in AMDGPUCombine.td:

> // (shift x, (zext amt)) -> (shift x, (and (anyext amt), mask))
> //
> // The pattern is longer, but is better for matching during ISel.
I guess we could handle both forms there
Yeah, ideally ISel should just handle more cases, but I wasted a lot of hours trying to make the ISel patterns work. I lack experience in that area, so I couldn't get it done; I always got weird inference errors.
At worst you should only need to add more explicit types to the patterns. The shift amount has a general register that supports many types