@@ -2094,6 +2094,74 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2094
2094
return true ;
2095
2095
}
2096
2096
2097
+ // Break s_mul_u64 into 32-bit vector operations.
2098
+ void AMDGPURegisterBankInfo::applyMappingSMULU64 (
2099
+ MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2100
+ SmallVector<Register, 2 > DefRegs (OpdMapper.getVRegs (0 ));
2101
+ SmallVector<Register, 2 > Src0Regs (OpdMapper.getVRegs (1 ));
2102
+ SmallVector<Register, 2 > Src1Regs (OpdMapper.getVRegs (2 ));
2103
+
2104
+ // All inputs are SGPRs, nothing special to do.
2105
+ if (DefRegs.empty ()) {
2106
+ assert (Src0Regs.empty () && Src1Regs.empty ());
2107
+ applyDefaultMapping (OpdMapper);
2108
+ return ;
2109
+ }
2110
+
2111
+ assert (DefRegs.size () == 2 );
2112
+ assert (Src0Regs.size () == Src1Regs.size () &&
2113
+ (Src0Regs.empty () || Src0Regs.size () == 2 ));
2114
+
2115
+ MachineRegisterInfo &MRI = OpdMapper.getMRI ();
2116
+ MachineInstr &MI = OpdMapper.getMI ();
2117
+ Register DstReg = MI.getOperand (0 ).getReg ();
2118
+ LLT HalfTy = LLT::scalar (32 );
2119
+
2120
+ // Depending on where the source registers came from, the generic code may
2121
+ // have decided to split the inputs already or not. If not, we still need to
2122
+ // extract the values.
2123
+
2124
+ if (Src0Regs.empty ())
2125
+ split64BitValueForMapping (B, Src0Regs, HalfTy, MI.getOperand (1 ).getReg ());
2126
+ else
2127
+ setRegsToType (MRI, Src0Regs, HalfTy);
2128
+
2129
+ if (Src1Regs.empty ())
2130
+ split64BitValueForMapping (B, Src1Regs, HalfTy, MI.getOperand (2 ).getReg ());
2131
+ else
2132
+ setRegsToType (MRI, Src1Regs, HalfTy);
2133
+
2134
+ setRegsToType (MRI, DefRegs, HalfTy);
2135
+
2136
+ // The multiplication is done as follows:
2137
+ //
2138
+ // Op1H Op1L
2139
+ // * Op0H Op0L
2140
+ // --------------------
2141
+ // Op1H*Op0L Op1L*Op0L
2142
+ // + Op1H*Op0H Op1L*Op0H
2143
+ // -----------------------------------------
2144
+ // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
2145
+ //
2146
+ // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
2147
+ // value and that would overflow.
2148
+ // The low 32-bit value is Op1L*Op0L.
2149
+ // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from
2150
+ // Op1L*Op0L).
2151
+
2152
+ ApplyRegBankMapping ApplyBank (B, *this , MRI, &AMDGPU::VGPRRegBank);
2153
+
2154
+ Register Hi = B.buildUMulH (HalfTy, Src0Regs[0 ], Src1Regs[0 ]).getReg (0 );
2155
+ Register MulLoHi = B.buildMul (HalfTy, Src0Regs[0 ], Src1Regs[1 ]).getReg (0 );
2156
+ Register Add = B.buildAdd (HalfTy, Hi, MulLoHi).getReg (0 );
2157
+ Register MulHiLo = B.buildMul (HalfTy, Src0Regs[1 ], Src1Regs[0 ]).getReg (0 );
2158
+ B.buildAdd (DefRegs[1 ], Add, MulHiLo);
2159
+ B.buildMul (DefRegs[0 ], Src0Regs[0 ], Src1Regs[0 ]);
2160
+
2161
+ MRI.setRegBank (DstReg, AMDGPU::VGPRRegBank);
2162
+ MI.eraseFromParent ();
2163
+ }
2164
+
2097
2165
void AMDGPURegisterBankInfo::applyMappingImpl (
2098
2166
MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2099
2167
MachineInstr &MI = OpdMapper.getMI ();
@@ -2394,13 +2462,21 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
2394
2462
Register DstReg = MI.getOperand (0 ).getReg ();
2395
2463
LLT DstTy = MRI.getType (DstReg);
2396
2464
2465
+ // Special case for s_mul_u64. There is not a vector equivalent of
2466
+ // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector
2467
+ // multiplications.
2468
+ if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits () == 64 ) {
2469
+ applyMappingSMULU64 (B, OpdMapper);
2470
+ return ;
2471
+ }
2472
+
2397
2473
// 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2398
2474
// Packed 16-bit operations need to be scalarized and promoted.
2399
2475
if (DstTy != LLT::scalar (16 ) && DstTy != LLT::fixed_vector (2 , 16 ))
2400
2476
break ;
2401
2477
2402
2478
const RegisterBank *DstBank =
2403
- OpdMapper.getInstrMapping ().getOperandMapping (0 ).BreakDown [0 ].RegBank ;
2479
+ OpdMapper.getInstrMapping ().getOperandMapping (0 ).BreakDown [0 ].RegBank ;
2404
2480
if (DstBank == &AMDGPU::VGPRRegBank)
2405
2481
break ;
2406
2482
@@ -2451,6 +2527,72 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
2451
2527
2452
2528
return ;
2453
2529
}
2530
+ case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
2531
+ case AMDGPU::G_AMDGPU_S_MUL_U64_U32: {
2532
+ // This is a special case for s_mul_u64. We use
2533
+ // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation
2534
+ // where the 33 higher bits are sign-extended and
2535
+ // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation
2536
+ // where the 32 higher bits are zero-extended. In case scalar registers are
2537
+ // selected, both opcodes are lowered as s_mul_u64. If the vector registers
2538
+ // are selected, then G_AMDGPU_S_MUL_I64_I32 and
2539
+ // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction.
2540
+
2541
+ // Insert basic copies.
2542
+ applyDefaultMapping (OpdMapper);
2543
+
2544
+ Register DstReg = MI.getOperand (0 ).getReg ();
2545
+ Register SrcReg0 = MI.getOperand (1 ).getReg ();
2546
+ Register SrcReg1 = MI.getOperand (2 ).getReg ();
2547
+ const LLT S32 = LLT::scalar (32 );
2548
+ const LLT S64 = LLT::scalar (64 );
2549
+ assert (MRI.getType (DstReg) == S64 && " This is a special case for s_mul_u64 "
2550
+ " that handles only 64-bit operands." );
2551
+ const RegisterBank *DstBank =
2552
+ OpdMapper.getInstrMapping ().getOperandMapping (0 ).BreakDown [0 ].RegBank ;
2553
+
2554
+ // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2555
+ // with s_mul_u64 operation.
2556
+ if (DstBank == &AMDGPU::SGPRRegBank) {
2557
+ MI.setDesc (TII->get (AMDGPU::S_MUL_U64));
2558
+ MRI.setRegClass (DstReg, &AMDGPU::SGPR_64RegClass);
2559
+ MRI.setRegClass (SrcReg0, &AMDGPU::SGPR_64RegClass);
2560
+ MRI.setRegClass (SrcReg1, &AMDGPU::SGPR_64RegClass);
2561
+ return ;
2562
+ }
2563
+
2564
+ // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2565
+ // with a vector mad.
2566
+ assert (MRI.getRegBankOrNull (DstReg) == &AMDGPU::VGPRRegBank &&
2567
+ " The destination operand should be in vector registers." );
2568
+
2569
+ DebugLoc DL = MI.getDebugLoc ();
2570
+
2571
+ // Extract the lower subregister from the first operand.
2572
+ Register Op0L = MRI.createVirtualRegister (&AMDGPU::VGPR_32RegClass);
2573
+ MRI.setRegClass (Op0L, &AMDGPU::VGPR_32RegClass);
2574
+ MRI.setType (Op0L, S32);
2575
+ B.buildTrunc (Op0L, SrcReg0);
2576
+
2577
+ // Extract the lower subregister from the second operand.
2578
+ Register Op1L = MRI.createVirtualRegister (&AMDGPU::VGPR_32RegClass);
2579
+ MRI.setRegClass (Op1L, &AMDGPU::VGPR_32RegClass);
2580
+ MRI.setType (Op1L, S32);
2581
+ B.buildTrunc (Op1L, SrcReg1);
2582
+
2583
+ unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32
2584
+ ? AMDGPU::G_AMDGPU_MAD_U64_U32
2585
+ : AMDGPU::G_AMDGPU_MAD_I64_I32;
2586
+
2587
+ MachineIRBuilder B (MI);
2588
+ Register Zero64 = B.buildConstant (S64, 0 ).getReg (0 );
2589
+ MRI.setRegClass (Zero64, &AMDGPU::VReg_64RegClass);
2590
+ Register CarryOut = MRI.createVirtualRegister (&AMDGPU::VReg_64RegClass);
2591
+ MRI.setRegClass (CarryOut, &AMDGPU::VReg_64RegClass);
2592
+ B.buildInstr (NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64});
2593
+ MI.eraseFromParent ();
2594
+ return ;
2595
+ }
2454
2596
case AMDGPU::G_SEXT_INREG: {
2455
2597
SmallVector<Register, 2 > SrcRegs (OpdMapper.getVRegs (1 ));
2456
2598
if (SrcRegs.empty ())
@@ -3669,7 +3811,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3669
3811
3670
3812
case AMDGPU::G_AND:
3671
3813
case AMDGPU::G_OR:
3672
- case AMDGPU::G_XOR: {
3814
+ case AMDGPU::G_XOR:
3815
+ case AMDGPU::G_MUL: {
3673
3816
unsigned Size = MRI.getType (MI.getOperand (0 ).getReg ()).getSizeInBits ();
3674
3817
if (Size == 1 ) {
3675
3818
const RegisterBank *DstBank
@@ -3737,7 +3880,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3737
3880
case AMDGPU::G_PTRMASK:
3738
3881
case AMDGPU::G_ADD:
3739
3882
case AMDGPU::G_SUB:
3740
- case AMDGPU::G_MUL:
3741
3883
case AMDGPU::G_SHL:
3742
3884
case AMDGPU::G_LSHR:
3743
3885
case AMDGPU::G_ASHR:
@@ -3755,6 +3897,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3755
3897
case AMDGPU::G_SHUFFLE_VECTOR:
3756
3898
case AMDGPU::G_SBFX:
3757
3899
case AMDGPU::G_UBFX:
3900
+ case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
3901
+ case AMDGPU::G_AMDGPU_S_MUL_U64_U32:
3758
3902
if (isSALUMapping (MI))
3759
3903
return getDefaultMappingSOP (MI);
3760
3904
return getDefaultMappingVOP (MI);
0 commit comments