[AMDGPU][RegBankInfo] Promote scalar i16 and/or/xor to i32 #131306
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Pierre van Houtryve (Pierre-vh)

Changes: See #64591

Patch is 72.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131306.diff

5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index c19ee14ab1574..27b86723ce474 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2416,9 +2416,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
- if (DstTy.getSizeInBits() == 1) {
- const RegisterBank *DstBank =
+ const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+
+ if (DstTy.getSizeInBits() == 1) {
if (DstBank == &AMDGPU::VCCRegBank)
break;
@@ -2432,6 +2433,29 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
+ // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
+ // Packed 16-bit operations need to be scalarized and promoted.
+ if (DstTy.getSizeInBits() == 16 && DstBank == &AMDGPU::SGPRRegBank) {
+ const LLT S32 = LLT::scalar(32);
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction *MF = MBB->getParent();
+ ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
+ LegalizerHelper Helper(*MF, ApplySALU, B);
+ // Widen to S32, but handle `G_XOR x, -1` differently. Legalizer widening
+ // will use a G_ANYEXT to extend the -1 which prevents matching G_XOR -1
+ // as "not".
+ if (MI.getOpcode() == AMDGPU::G_XOR &&
+ mi_match(MI.getOperand(2).getReg(), MRI, m_SpecificICstOrSplat(-1))) {
+ Helper.widenScalarSrc(MI, S32, 1, AMDGPU::G_ANYEXT);
+ Helper.widenScalarSrc(MI, S32, 2, AMDGPU::G_SEXT);
+ Helper.widenScalarDst(MI, S32);
+ } else {
+ if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
+ llvm_unreachable("widen scalar should have succeeded");
+ }
+ return;
+ }
+
if (DstTy.getSizeInBits() != 64)
break;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index 1a94429b1b5a1..36359579ea442 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -391,20 +391,20 @@ define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) {
; GCN-LABEL: s_andn2_i16_multi_use:
; GCN: ; %bb.0:
-; GCN-NEXT: s_xor_b32 s1, s3, -1
+; GCN-NEXT: s_not_b32 s1, s3
; GCN-NEXT: s_andn2_b32 s0, s2, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_i16_multi_use:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_andn2_b32 s0, s2, s3
-; GFX10-NEXT: s_xor_b32 s1, s3, -1
+; GFX10-NEXT: s_not_b32 s1, s3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_i16_multi_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_not1_b32 s0, s2, s3
-; GFX11-NEXT: s_xor_b32 s1, s3, -1
+; GFX11-NEXT: s_not_b32 s1, s3
; GFX11-NEXT: ; return to shader part epilog
%not.src1 = xor i16 %src1, -1
%and = and i16 %src0, %not.src1
@@ -482,14 +482,14 @@ define amdgpu_ps float @v_andn2_i16_sv(i16 inreg %src0, i16 %src1) {
define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) {
; GCN-LABEL: v_andn2_i16_vs:
; GCN: ; %bb.0:
-; GCN-NEXT: s_xor_b32 s0, s2, -1
+; GCN-NEXT: s_not_b32 s0, s2
; GCN-NEXT: v_and_b32_e32 v0, s0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i16_vs:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_xor_b32 s0, s2, -1
+; GFX10PLUS-NEXT: s_not_b32 s0, s2
; GFX10PLUS-NEXT: v_and_b32_e32 v0, s0, v0
; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10PLUS-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index e60739fd84059..3a52497bd6e91 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -1052,17 +1052,14 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX8-NEXT: s_lshr_b32 s2, s2, s3
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s2, s10, 7
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_and_b32 s3, s7, 0xff
; GFX8-NEXT: s_lshl_b32 s2, s4, s2
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_and_b32 s3, s7, 0xff
; GFX8-NEXT: s_andn2_b32 s4, 7, s10
; GFX8-NEXT: s_lshr_b32 s3, s3, 1
; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_lshr_b32 s3, s3, s4
; GFX8-NEXT: s_or_b32 s2, s2, s3
; GFX8-NEXT: s_and_b32 s3, s11, 7
-; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshl_b32 s3, s5, s3
; GFX8-NEXT: s_andn2_b32 s5, 7, s11
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
@@ -1113,17 +1110,14 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX9-NEXT: s_lshr_b32 s2, s2, s3
; GFX9-NEXT: s_or_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s2, s10, 7
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX9-NEXT: s_and_b32 s3, s7, 0xff
; GFX9-NEXT: s_lshl_b32 s2, s4, s2
-; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX9-NEXT: s_and_b32 s3, s7, 0xff
; GFX9-NEXT: s_andn2_b32 s4, 7, s10
; GFX9-NEXT: s_lshr_b32 s3, s3, 1
; GFX9-NEXT: s_and_b32 s4, 0xffff, s4
; GFX9-NEXT: s_lshr_b32 s3, s3, s4
; GFX9-NEXT: s_or_b32 s2, s2, s3
; GFX9-NEXT: s_and_b32 s3, s11, 7
-; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
; GFX9-NEXT: s_lshl_b32 s3, s5, s3
; GFX9-NEXT: s_andn2_b32 s5, 7, s11
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
@@ -1148,57 +1142,54 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX10-NEXT: s_lshr_b32 s7, s1, 16
; GFX10-NEXT: s_lshr_b32 s8, s1, 24
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_and_b32 s11, s2, 7
+; GFX10-NEXT: s_lshr_b32 s9, s2, 8
+; GFX10-NEXT: s_lshr_b32 s10, s2, 16
+; GFX10-NEXT: s_lshr_b32 s11, s2, 24
+; GFX10-NEXT: s_and_b32 s12, s2, 7
; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_andn2_b32 s12, 7, s2
-; GFX10-NEXT: s_and_b32 s11, 0xffff, s11
+; GFX10-NEXT: s_andn2_b32 s2, 7, s2
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
-; GFX10-NEXT: s_and_b32 s12, 0xffff, s12
-; GFX10-NEXT: s_lshr_b32 s3, s0, 8
-; GFX10-NEXT: s_lshr_b32 s4, s0, 16
-; GFX10-NEXT: s_lshr_b32 s5, s0, 24
-; GFX10-NEXT: s_lshr_b32 s9, s2, 8
-; GFX10-NEXT: s_lshl_b32 s0, s0, s11
-; GFX10-NEXT: s_lshr_b32 s1, s1, s12
+; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
; GFX10-NEXT: s_and_b32 s6, s6, 0xff
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_and_b32 s1, s9, 7
+; GFX10-NEXT: s_lshr_b32 s1, s1, s2
+; GFX10-NEXT: s_and_b32 s2, s9, 7
; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
; GFX10-NEXT: s_andn2_b32 s9, 7, s9
-; GFX10-NEXT: s_lshr_b32 s10, s2, 16
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT: s_lshr_b32 s3, s0, 8
+; GFX10-NEXT: s_and_b32 s12, 0xffff, s12
+; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
; GFX10-NEXT: s_lshr_b32 s6, s6, 1
; GFX10-NEXT: s_and_b32 s9, 0xffff, s9
-; GFX10-NEXT: s_lshl_b32 s1, s3, s1
+; GFX10-NEXT: s_lshr_b32 s4, s0, 16
+; GFX10-NEXT: s_lshr_b32 s5, s0, 24
+; GFX10-NEXT: s_lshl_b32 s0, s0, s12
+; GFX10-NEXT: s_lshl_b32 s2, s3, s2
; GFX10-NEXT: s_lshr_b32 s3, s6, s9
-; GFX10-NEXT: s_and_b32 s6, s10, 7
-; GFX10-NEXT: s_or_b32 s1, s1, s3
-; GFX10-NEXT: s_and_b32 s3, 0xffff, s6
-; GFX10-NEXT: s_and_b32 s6, s7, 0xff
-; GFX10-NEXT: s_lshr_b32 s2, s2, 24
-; GFX10-NEXT: s_lshl_b32 s3, s4, s3
-; GFX10-NEXT: s_and_b32 s4, 0xffff, s6
+; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: s_or_b32 s1, s2, s3
+; GFX10-NEXT: s_and_b32 s3, s7, 0xff
; GFX10-NEXT: s_andn2_b32 s6, 7, s10
-; GFX10-NEXT: s_lshr_b32 s4, s4, 1
+; GFX10-NEXT: s_lshr_b32 s3, s3, 1
; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX10-NEXT: s_and_b32 s7, s2, 7
-; GFX10-NEXT: s_andn2_b32 s2, 7, s2
-; GFX10-NEXT: s_lshr_b32 s4, s4, s6
-; GFX10-NEXT: s_and_b32 s6, 0xffff, s7
+; GFX10-NEXT: s_and_b32 s2, s10, 7
+; GFX10-NEXT: s_lshr_b32 s3, s3, s6
+; GFX10-NEXT: s_andn2_b32 s6, 7, s11
+; GFX10-NEXT: s_lshl_b32 s2, s4, s2
+; GFX10-NEXT: s_and_b32 s4, s11, 7
; GFX10-NEXT: s_lshr_b32 s7, s8, 1
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT: s_lshl_b32 s5, s5, s6
-; GFX10-NEXT: s_lshr_b32 s2, s7, s2
-; GFX10-NEXT: s_or_b32 s3, s3, s4
+; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT: s_lshl_b32 s4, s5, s4
+; GFX10-NEXT: s_lshr_b32 s5, s7, s6
+; GFX10-NEXT: s_or_b32 s2, s2, s3
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_or_b32 s2, s5, s2
+; GFX10-NEXT: s_or_b32 s3, s4, s5
; GFX10-NEXT: s_and_b32 s0, s0, 0xff
; GFX10-NEXT: s_lshl_b32 s1, s1, 8
-; GFX10-NEXT: s_and_b32 s3, s3, 0xff
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_lshl_b32 s1, s3, 16
; GFX10-NEXT: s_and_b32 s2, s2, 0xff
; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: s_lshl_b32 s1, s2, 16
+; GFX10-NEXT: s_and_b32 s2, s3, 0xff
+; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: s_lshl_b32 s1, s2, 24
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
@@ -1209,57 +1200,54 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX11-NEXT: s_lshr_b32 s7, s1, 16
; GFX11-NEXT: s_lshr_b32 s8, s1, 24
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_and_b32 s11, s2, 7
+; GFX11-NEXT: s_lshr_b32 s9, s2, 8
+; GFX11-NEXT: s_lshr_b32 s10, s2, 16
+; GFX11-NEXT: s_lshr_b32 s11, s2, 24
+; GFX11-NEXT: s_and_b32 s12, s2, 7
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_not1_b32 s12, 7, s2
-; GFX11-NEXT: s_and_b32 s11, 0xffff, s11
+; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
-; GFX11-NEXT: s_and_b32 s12, 0xffff, s12
-; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-NEXT: s_lshr_b32 s5, s0, 24
-; GFX11-NEXT: s_lshr_b32 s9, s2, 8
-; GFX11-NEXT: s_lshl_b32 s0, s0, s11
-; GFX11-NEXT: s_lshr_b32 s1, s1, s12
+; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
; GFX11-NEXT: s_and_b32 s6, s6, 0xff
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_and_b32 s1, s9, 7
+; GFX11-NEXT: s_lshr_b32 s1, s1, s2
+; GFX11-NEXT: s_and_b32 s2, s9, 7
; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
; GFX11-NEXT: s_and_not1_b32 s9, 7, s9
-; GFX11-NEXT: s_lshr_b32 s10, s2, 16
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT: s_lshr_b32 s3, s0, 8
+; GFX11-NEXT: s_and_b32 s12, 0xffff, s12
+; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
; GFX11-NEXT: s_lshr_b32 s6, s6, 1
; GFX11-NEXT: s_and_b32 s9, 0xffff, s9
-; GFX11-NEXT: s_lshl_b32 s1, s3, s1
+; GFX11-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-NEXT: s_lshr_b32 s5, s0, 24
+; GFX11-NEXT: s_lshl_b32 s0, s0, s12
+; GFX11-NEXT: s_lshl_b32 s2, s3, s2
; GFX11-NEXT: s_lshr_b32 s3, s6, s9
-; GFX11-NEXT: s_and_b32 s6, s10, 7
-; GFX11-NEXT: s_or_b32 s1, s1, s3
-; GFX11-NEXT: s_and_b32 s3, 0xffff, s6
-; GFX11-NEXT: s_and_b32 s6, s7, 0xff
-; GFX11-NEXT: s_lshr_b32 s2, s2, 24
-; GFX11-NEXT: s_lshl_b32 s3, s4, s3
-; GFX11-NEXT: s_and_b32 s4, 0xffff, s6
+; GFX11-NEXT: s_or_b32 s0, s0, s1
+; GFX11-NEXT: s_or_b32 s1, s2, s3
+; GFX11-NEXT: s_and_b32 s3, s7, 0xff
; GFX11-NEXT: s_and_not1_b32 s6, 7, s10
-; GFX11-NEXT: s_lshr_b32 s4, s4, 1
+; GFX11-NEXT: s_lshr_b32 s3, s3, 1
; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT: s_and_b32 s7, s2, 7
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
-; GFX11-NEXT: s_lshr_b32 s4, s4, s6
-; GFX11-NEXT: s_and_b32 s6, 0xffff, s7
+; GFX11-NEXT: s_and_b32 s2, s10, 7
+; GFX11-NEXT: s_lshr_b32 s3, s3, s6
+; GFX11-NEXT: s_and_not1_b32 s6, 7, s11
+; GFX11-NEXT: s_lshl_b32 s2, s4, s2
+; GFX11-NEXT: s_and_b32 s4, s11, 7
; GFX11-NEXT: s_lshr_b32 s7, s8, 1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT: s_lshl_b32 s5, s5, s6
-; GFX11-NEXT: s_lshr_b32 s2, s7, s2
-; GFX11-NEXT: s_or_b32 s3, s3, s4
+; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX11-NEXT: s_lshl_b32 s4, s5, s4
+; GFX11-NEXT: s_lshr_b32 s5, s7, s6
+; GFX11-NEXT: s_or_b32 s2, s2, s3
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_or_b32 s2, s5, s2
+; GFX11-NEXT: s_or_b32 s3, s4, s5
; GFX11-NEXT: s_and_b32 s0, s0, 0xff
; GFX11-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-NEXT: s_and_b32 s3, s3, 0xff
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_lshl_b32 s1, s3, 16
; GFX11-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-NEXT: s_or_b32 s0, s0, s1
+; GFX11-NEXT: s_lshl_b32 s1, s2, 16
+; GFX11-NEXT: s_and_b32 s2, s3, 0xff
+; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_lshl_b32 s1, s2, 24
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
@@ -1862,56 +1850,53 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX6: ; %bb.0:
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; GFX6-NEXT: s_bfe_u32 s9, s0, 0x80008
; GFX6-NEXT: s_lshr_b32 s6, s0, 16
; GFX6-NEXT: s_lshr_b32 s7, s1, 8
-; GFX6-NEXT: s_bfe_u32 s9, s0, 0x80008
; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX6-NEXT: s_and_b32 s8, s0, 0xff
; GFX6-NEXT: s_lshl_b32 s9, s9, 8
+; GFX6-NEXT: s_or_b32 s8, s8, s9
; GFX6-NEXT: s_and_b32 s6, s6, 0xff
; GFX6-NEXT: s_and_b32 s1, s1, 0xff
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_and_b32 s0, s7, 0xff
-; GFX6-NEXT: s_or_b32 s8, s8, s9
-; GFX6-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
-; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
; GFX6-NEXT: s_and_b32 s8, 0xffff, s8
; GFX6-NEXT: s_lshl_b32 s6, s6, 16
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: s_lshl_b32 s0, s0, 16
+; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
+; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
; GFX6-NEXT: v_not_b32_e32 v3, 23
; GFX6-NEXT: s_or_b32 s6, s8, s6
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: s_lshl_b32 s0, s0, 16
+; GFX6-NEXT: s_bfe_u32 s8, s2, 0x80008
+; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: s_lshr_b32 s0, s2, 16
; GFX6-NEXT: s_lshr_b32 s1, s3, 8
-; GFX6-NEXT: s_bfe_u32 s8, s2, 0x80008
-; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3
; GFX6-NEXT: s_and_b32 s7, s2, 0xff
; GFX6-NEXT: s_lshl_b32 s8, s8, 8
+; GFX6-NEXT: s_or_b32 s7, s7, s8
; GFX6-NEXT: s_and_b32 s0, s0, 0xff
; GFX6-NEXT: s_and_b32 s3, s3, 0xff
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: s_and_b32 s1, s1, 0xff
-; GFX6-NEXT: s_or_b32 s7, s7, s8
-; GFX6-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 24
-; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
; GFX6-NEXT: s_and_b32 s7, 0xffff, s7
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
+; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 24
+; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX6-NEXT: s_or_b32 s0, s7, s0
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
-; GFX6-NEXT: s_or_b32 s0, s7, s0
-; GFX6-NEXT: v_or_b32_e32 v1, s1, v1
-; GFX6-NEXT: s_lshr_b32 s1, s4, 16
; GFX6-NEXT: s_bfe_u32 s7, s4, 0x80008
; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4
+; GFX6-NEXT: v_or_b32_e32 v1, s1, v1
+; GFX6-NEXT: s_lshr_b32 s1, s4, 16
; GFX6-NEXT: s_and_b32 s3, s4, 0xff
; GFX6-NEXT: s_lshl_b32 s7, s7, 8
-; GFX6-NEXT: s_and_b32 s1, s1, 0xff
; GFX6-NEXT: s_or_b32 s3, s3, s7
-; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX6-NEXT: s_and_b32 s1, s1, 0xff
; GFX6-NEXT: s_and_b32 s3, 0xffff, s3
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_or_b32 s1, s3, s1
@@ -1985,56 +1970,53 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX8-NEXT: s_lshl_b32 s6, s6, 8
; GFX8-NEXT: s_or_b32 s0, s0, s6
; GFX8-NEXT: s_and_b32 s6, s7, 0xff
-; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX8-NEXT: s_lshr_b32 s9, s1, 8
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshl_b32 s6, s6, 16
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
+; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: s_or_b32 s0, s0, s6
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
; GFX8-NEXT: s_and_b32 s6, s9, 0xff
-; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX8-NEXT: s_or_b32 s1, s8, s1
; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshl_b32 s6, s6, 16
; GFX8-NEXT: s_or_b32 s1, s1, s6
; GFX8-NEXT: s_lshr_b32 s6, s2, 8
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: s_and_b32 s6, s6, 0xff
+; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: s_lshr_b32 s7, s2, 16
; GFX8-NEXT: s_lshr_b32 s8, s2, 24
; GFX8-NEXT: s_and_b32 s2, s2, 0xff
; GFX8-NEXT: s_lshl_b32 s6, s6, 8
-; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: s_or_b32 s2, s2, s6
; GFX8-NEXT: s_and_b32 s6, s7, 0xff
-; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
; GFX8-NEXT: s_lshr_b32 s9, s3, 8
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshl_b32 s6, s6, 16
; GFX8-NEXT: s_and_b32 s3, s3, 0xff
+; GFX8-NEXT: v_not_b32_e32 v1, 23
; GFX8-NEXT: s_or_b32 s2, s2, s6
; GFX8-NEXT: s_lshl_b32 s3, s3, 8
; GFX8-NEXT: s_and_b32 s6, s9, 0xff
-; GFX8-NEXT: v_not_b32_e32 v1, 23
+; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1
; GFX8-NEXT: s_or_b32 s3, s8, s3
; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshl_b32 s6, s6, 16
; GFX8-NEXT: s_or_b32 s3, s3, s6
; GFX8-NEXT: s_lshr_b32 s6, s4, 8
; GFX8-NEXT: s_and_b32 s6, s6, 0xff
+; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX8-NEXT: s_lshr_b32 s7, s4, 16
; GFX8-NEXT: s_lshr_b32 s8, s4, 24
; GFX8-NEXT: s_and_b32 s4, s4, 0xff
; GFX8-NEXT: s_lshl_b32 s6, s6, 8
-; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX8-NEXT: s_or_b32 s4, s4, s6
; GFX8-NEXT: s_and_b32 s6, s7, 0xff
-; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_lshl_b32 s6, s6, 16
; GFX8-NEXT: s_or_b32 s4, s4, s6
@@ -2105,56 +2087,53 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
; GFX9-NEXT: s_or_b32 s0, s0, s6
; GFX9-NEXT: s_and_b32 s6, s7, 0xff
-; GFX9-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX9-NEXT: s_lshr_b32 s9, s1, 8
; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: s_lshl_b32 s6, s6, 16
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_or_b32 s0, s0, s6
; GFX9-NEXT: s_lshl_b32 s1, s1, 8
; GFX9-NEXT: s_and_b32 s6, s9, 0xff
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
; GFX9-NEXT: s_or_b32 s1, s8, s1
; GFX9-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_lshl_b32 s6, s6, 16
; GFX9-NEXT: s_or_b32 s1, s1, s6
; GFX9-NEXT: s_lshr_b32 s6, s2, 8
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: s_and_b32 s6, s6, 0xff
+; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: s_lshr_b32 s7, s2, 16
; GFX9-NEXT: s_lshr_b32 s8, s2, 24
; GFX9-NEXT: s_and_b32 s2, s2, 0xff
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
-; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: s_or_b32 s2, s2, s6
; GFX9-NEXT: s_and_b32 s6, s7, 0xff
-; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: s_and_b32 s6, 0xffff, s6
; GFX9-NEXT: s_lshr_b32 s9, s3, 8
; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: s_lshl_b32 s6, s6, 16
; GFX9-NEXT: s_and_b32 s3, s3, 0xff
+; GFX9-NEXT: v_not_b32_e32 v1, 23
; GF...
[truncated]
Force-pushed from a9f0563 to 1af8346
// Widen to S32, but handle `G_XOR x, -1` differently. Legalizer widening
// will use a G_ANYEXT to extend the -1 which prevents matching G_XOR -1
// as "not".
Maybe the LegalizerHelper should account for this and sext -1s.
s_not_b32 s1, s3 has no advantage over s_xor_b32 s1, s3, -1, so maybe it's not worth worrying about?
I'd still expect to form not patterns canonically. I don't know why the ISA provides it, but other patterns make use of not.
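For reference, here is a minimal reduced case of the pattern being discussed. This is a hypothetical example, not taken from this patch's test files, and the function name is invented. The point of the G_SEXT special case is that anyext'ing the -1 leaves the high 16 bits of the widened constant undefined, so the widened G_XOR no longer looks like a 32-bit "not"; sign-extending keeps the constant all-ones.

; Hypothetical reduced test (assumes the GlobalISel path, e.g.
; llc -global-isel on an amdgcn triple): a uniform i16 "not". With the
; -1 sign-extended during widening, this is expected to select to
; s_not_b32 rather than a 32-bit xor against a partially-undefined constant.
define amdgpu_ps i16 @s_not_i16(i16 inreg %src) {
  %not = xor i16 %src, -1
  ret i16 %not
}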
@@ -2432,6 +2433,29 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
      return;
    }

    // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
    // Packed 16-bit operations need to be scalarized and promoted.
What does the comment about packed 16-bit refer to? I don't see any code that implements that here. Is it a TODO? Or does it happen somewhere else? But anyway, for and/or/xor you don't need to scalarize and promote them - you can do the operation directly on the packed values.
It was copy-pasted from below and I forgot to remove it; it's irrelevant here.
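To make the reviewer's point above concrete, here is a hypothetical packed case, not part of this change or its tests; the function name is invented. Because and/or/xor are pure bitwise operations, a uniform <2 x i16> pair can be handled as a single 32-bit SALU op on the packed register, with no scalarization or per-half promotion.

; Hypothetical example, not exercised by this patch: a uniform <2 x i16> AND.
; The two 16-bit halves can be combined with one 32-bit s_and_b32 on the
; packed SGPR, rather than being scalarized and promoted individually.
define amdgpu_ps <2 x i16> @s_and_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
  %and = and <2 x i16> %a, %b
  ret <2 x i16> %and
}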
Force-pushed from 251835c to 3a471d0
Force-pushed from 858234d to 3635048
Force-pushed from 3635048 to 7559408
See #64591